blob: 7979eec84562f4452f3100aac08f88dd4dc05c56 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
/*[clinic input]
class str "PyUnicodeObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
71
/* Type check used by the accessor macros of this file.  Builds with
   Py_DEBUG run _PyUnicode_CheckConsistency() (without the content scan);
   other builds only call PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors for unicode objects.

   The underscore-prefixed macros cast `op` to the relevant struct and name
   the field directly (they are usable as lvalues).  PyUnicode_UTF8() and
   PyUnicode_UTF8_LENGTH() additionally assert that `op` is a ready unicode
   object and, for compact ASCII objects, use the character data stored
   right after the PyASCIIObject header and the `length` field instead of
   the utf8/utf8_length fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when PyUnicode_IS_READY(op) is true and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                  \
     (PyUnicode_IS_READY(op) ?                      \
      0 :                                           \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of `op` is the same pointer as its character
   data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data.  The macro argument is used on both sides of the comparison; it
   must not refer to any identifier of the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The source pointers are only read, so they are cast to
   "const from_type *" (no const qualifier is discarded).  The bulk of the
   input is copied four elements per iteration; the remaining 0..3
   elements are copied one by one.  All locals are underscore-prefixed. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Overallocation divisor: a factor of N corresponds to overallocating by
   1/N of the size (per the per-platform notes below). */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; a non-zero entry marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks: indexed by code point 0..127; a non-zero entry
   marks a line break character. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers that are recognized by name. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors -- handler name, or NULL.

   NULL and "strict" both give _Py_ERROR_STRICT.  Names are compared with
   strcmp(), i.e. exactly and case-sensitively; any name that is not in
   the table gives _Py_ERROR_OTHER. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build validation of the internal layout of a unicode object.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- when non-zero, and the string is not in the
                    PyUnicode_WCHAR_KIND form, the characters are scanned
                    as well to verify that the narrowest kind able to hold
                    them is used and that the buffer ends with a null
                    character.

   Every check is an assert(); the function itself always returns 1, which
   lets callers wrap the call in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, not compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the data pointer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be null */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier: every byte of the characters in [old_length, length) is set to
   0xff.  Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the width of one character in bytes */
        Py_UCS1 *first = bytes + old_length * char_size;

        memset(first, 0xff, (new_length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001051 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
1168char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001169 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170}
1171
1172void *_PyUnicode_compact_data(void *unicode) {
1173 return _PyUnicode_COMPACT_DATA(unicode);
1174}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
1334#if SIZEOF_WCHAR_T == 2
1335/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1336 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001337 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338
1339 This function assumes that unicode can hold one more code point than wstr
1340 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001341static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001343 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344{
1345 const wchar_t *iter;
1346 Py_UCS4 *ucs4_out;
1347
Victor Stinner910337b2011-10-03 03:20:16 +02001348 assert(unicode != NULL);
1349 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1351 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1352
1353 for (iter = begin; iter < end; ) {
1354 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1355 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001356 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1357 && (iter+1) < end
1358 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 {
Victor Stinner551ac952011-11-29 22:58:13 +01001360 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 iter += 2;
1362 }
1363 else {
1364 *ucs4_out++ = *iter;
1365 iter++;
1366 }
1367 }
1368 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1369 _PyUnicode_GET_LENGTH(unicode)));
1370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371}
1372#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001438 Py_MEMCPY((char*)to_data + to_kind * to_start,
1439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Victor Stinnerd3f08822012-05-29 12:57:52 +02001552 if (from_start < 0) {
1553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
1556 if (to_start < 0) {
1557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001560 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1561 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1562 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001563 "Cannot write %zi characters at %zi "
1564 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 how_many, to_start, PyUnicode_GET_LENGTH(to));
1566 return -1;
1567 }
1568
1569 if (how_many == 0)
1570 return 0;
1571
Victor Stinner488fa492011-12-12 00:01:39 +01001572 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 return -1;
1574
1575 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1576 if (err) {
1577 PyErr_Format(PyExc_SystemError,
1578 "Cannot copy %s characters "
1579 "into a string of %s characters",
1580 unicode_kind_name(from),
1581 unicode_kind_name(to));
1582 return -1;
1583 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585}
1586
Victor Stinner17222162011-09-28 22:15:37 +02001587/* Find the maximum code point and count the number of surrogate pairs so a
1588 correct string length can be computed before converting a string to UCS4.
1589 This function counts single surrogates as a character and not as a pair.
1590
1591 Return 0 on success, or -1 on error. */
1592static int
1593find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1594 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595{
1596 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001597 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598
Victor Stinnerc53be962011-10-02 21:33:54 +02001599 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 *num_surrogates = 0;
1601 *maxchar = 0;
1602
1603 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001605 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1606 && (iter+1) < end
1607 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1608 {
1609 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1610 ++(*num_surrogates);
1611 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 }
1613 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001615 {
1616 ch = *iter;
1617 iter++;
1618 }
1619 if (ch > *maxchar) {
1620 *maxchar = ch;
1621 if (*maxchar > MAX_UNICODE) {
1622 PyErr_Format(PyExc_ValueError,
1623 "character U+%x is not in range [U+0000; U+10ffff]",
1624 ch);
1625 return -1;
1626 }
1627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 }
1629 return 0;
1630}
1631
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001632int
1633_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634{
1635 wchar_t *end;
1636 Py_UCS4 maxchar = 0;
1637 Py_ssize_t num_surrogates;
1638#if SIZEOF_WCHAR_T == 2
1639 Py_ssize_t length_wo_surrogates;
1640#endif
1641
Georg Brandl7597add2011-10-05 16:36:47 +02001642 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001643 strings were created using _PyObject_New() and where no canonical
1644 representation (the str field) has been set yet aka strings
1645 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001646 assert(_PyUnicode_CHECK(unicode));
1647 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001649 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001650 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001651 /* Actually, it should neither be interned nor be anything else: */
1652 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001655 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001656 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658
1659 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001660 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1661 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 PyErr_NoMemory();
1663 return -1;
1664 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001665 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 _PyUnicode_WSTR(unicode), end,
1667 PyUnicode_1BYTE_DATA(unicode));
1668 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1669 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1670 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1671 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001672 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001673 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001674 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 }
1676 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001677 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8(unicode) = NULL;
1679 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 }
1681 PyObject_FREE(_PyUnicode_WSTR(unicode));
1682 _PyUnicode_WSTR(unicode) = NULL;
1683 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1684 }
1685 /* In this case we might have to convert down from 4-byte native
1686 wchar_t to 2-byte unicode. */
1687 else if (maxchar < 65536) {
1688 assert(num_surrogates == 0 &&
1689 "FindMaxCharAndNumSurrogatePairs() messed up");
1690
Victor Stinner506f5922011-09-28 22:34:18 +02001691#if SIZEOF_WCHAR_T == 2
1692 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001694 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1695 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1696 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001699#else
1700 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001701 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001702 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001704 PyErr_NoMemory();
1705 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 }
Victor Stinner506f5922011-09-28 22:34:18 +02001707 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1708 _PyUnicode_WSTR(unicode), end,
1709 PyUnicode_2BYTE_DATA(unicode));
1710 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1711 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1712 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001713 _PyUnicode_UTF8(unicode) = NULL;
1714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001715 PyObject_FREE(_PyUnicode_WSTR(unicode));
1716 _PyUnicode_WSTR(unicode) = NULL;
1717 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1718#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 }
1720 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1721 else {
1722#if SIZEOF_WCHAR_T == 2
1723 /* in case the native representation is 2-bytes, we need to allocate a
1724 new normalized 4-byte version. */
1725 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001726 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1727 PyErr_NoMemory();
1728 return -1;
1729 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1731 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 PyErr_NoMemory();
1733 return -1;
1734 }
1735 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1736 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001737 _PyUnicode_UTF8(unicode) = NULL;
1738 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001739 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1740 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001741 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 PyObject_FREE(_PyUnicode_WSTR(unicode));
1743 _PyUnicode_WSTR(unicode) = NULL;
1744 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1745#else
1746 assert(num_surrogates == 0);
1747
Victor Stinnerc3c74152011-10-02 20:39:55 +02001748 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001750 _PyUnicode_UTF8(unicode) = NULL;
1751 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1753#endif
1754 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1755 }
1756 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001757 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 return 0;
1759}
1760
Alexander Belopolsky40018472011-02-26 01:02:56 +00001761static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001762unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763{
Walter Dörwald16807132007-05-25 13:52:07 +00001764 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001765 case SSTATE_NOT_INTERNED:
1766 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001767
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_INTERNED_MORTAL:
1769 /* revive dead object temporarily for DelItem */
1770 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001771 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 Py_FatalError(
1773 "deletion of interned string failed");
1774 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001775
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 case SSTATE_INTERNED_IMMORTAL:
1777 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001778
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 default:
1780 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001781 }
1782
Victor Stinner03490912011-10-03 23:45:12 +02001783 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001785 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001787 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1788 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001790 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791}
1792
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or the cached one-character Latin-1 string for its
   character), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *base = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       a Latin-1 singleton. */
    if (base->state.kind == PyUnicode_WCHAR_KIND || base->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1809
Alexander Belopolsky40018472011-02-26 01:02:56 +00001810static int
Victor Stinner488fa492011-12-12 00:01:39 +01001811unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812{
Victor Stinner488fa492011-12-12 00:01:39 +01001813 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001814 if (Py_REFCNT(unicode) != 1)
1815 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001816 if (_PyUnicode_HASH(unicode) != -1)
1817 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (PyUnicode_CHECK_INTERNED(unicode))
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (!PyUnicode_CheckExact(unicode))
1821 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001822#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001823 /* singleton refcount is greater than 1 */
1824 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001825#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001826 return 1;
1827}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001828
Victor Stinnerfe226c02011-10-03 03:52:20 +02001829static int
1830unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1831{
1832 PyObject *unicode;
1833 Py_ssize_t old_length;
1834
1835 assert(p_unicode != NULL);
1836 unicode = *p_unicode;
1837
1838 assert(unicode != NULL);
1839 assert(PyUnicode_Check(unicode));
1840 assert(0 <= length);
1841
Victor Stinner910337b2011-10-03 03:20:16 +02001842 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001843 old_length = PyUnicode_WSTR_LENGTH(unicode);
1844 else
1845 old_length = PyUnicode_GET_LENGTH(unicode);
1846 if (old_length == length)
1847 return 0;
1848
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001849 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001850 _Py_INCREF_UNICODE_EMPTY();
1851 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001853 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001854 return 0;
1855 }
1856
Victor Stinner488fa492011-12-12 00:01:39 +01001857 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 PyObject *copy = resize_copy(unicode, length);
1859 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001860 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001861 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001863 }
1864
Victor Stinnerfe226c02011-10-03 03:52:20 +02001865 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001866 PyObject *new_unicode = resize_compact(unicode, length);
1867 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001872 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873}
1874
Alexander Belopolsky40018472011-02-26 01:02:56 +00001875int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001876PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001877{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 PyObject *unicode;
1879 if (p_unicode == NULL) {
1880 PyErr_BadInternalCall();
1881 return -1;
1882 }
1883 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001884 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001885 {
1886 PyErr_BadInternalCall();
1887 return -1;
1888 }
1889 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001890}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001891
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001892/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001893
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001894 WARNING: The function doesn't copy the terminating null character and
1895 doesn't check the maximum character (may write a latin1 character in an
1896 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001897static void
1898unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1899 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001900{
1901 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1902 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001903 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001904
1905 switch (kind) {
1906 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001907 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001908#ifdef Py_DEBUG
1909 if (PyUnicode_IS_ASCII(unicode)) {
1910 Py_UCS4 maxchar = ucs1lib_find_max_char(
1911 (const Py_UCS1*)str,
1912 (const Py_UCS1*)str + len);
1913 assert(maxchar < 128);
1914 }
1915#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001916 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001917 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001918 }
1919 case PyUnicode_2BYTE_KIND: {
1920 Py_UCS2 *start = (Py_UCS2 *)data + index;
1921 Py_UCS2 *ucs2 = start;
1922 assert(index <= PyUnicode_GET_LENGTH(unicode));
1923
Victor Stinner184252a2012-06-16 02:57:41 +02001924 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001925 *ucs2 = (Py_UCS2)*str;
1926
1927 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001928 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 }
1930 default: {
1931 Py_UCS4 *start = (Py_UCS4 *)data + index;
1932 Py_UCS4 *ucs4 = start;
1933 assert(kind == PyUnicode_4BYTE_KIND);
1934 assert(index <= PyUnicode_GET_LENGTH(unicode));
1935
Victor Stinner184252a2012-06-16 02:57:41 +02001936 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 *ucs4 = (Py_UCS4)*str;
1938
1939 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001940 }
1941 }
1942}
1943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944static PyObject*
1945get_latin1_char(unsigned char ch)
1946{
Victor Stinnera464fc12011-10-02 20:39:30 +02001947 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001949 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!unicode)
1951 return NULL;
1952 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001953 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 unicode_latin1[ch] = unicode;
1955 }
1956 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001957 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958}
1959
Victor Stinner985a82a2014-01-03 12:53:47 +01001960static PyObject*
1961unicode_char(Py_UCS4 ch)
1962{
1963 PyObject *unicode;
1964
1965 assert(ch <= MAX_UNICODE);
1966
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001967 if (ch < 256)
1968 return get_latin1_char(ch);
1969
Victor Stinner985a82a2014-01-03 12:53:47 +01001970 unicode = PyUnicode_New(1, ch);
1971 if (unicode == NULL)
1972 return NULL;
1973 switch (PyUnicode_KIND(unicode)) {
1974 case PyUnicode_1BYTE_KIND:
1975 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1976 break;
1977 case PyUnicode_2BYTE_KIND:
1978 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1979 break;
1980 default:
1981 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1982 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1983 }
1984 assert(_PyUnicode_CheckConsistency(unicode, 1));
1985 return unicode;
1986}
1987
Alexander Belopolsky40018472011-02-26 01:02:56 +00001988PyObject *
1989PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001991 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 Py_UCS4 maxchar = 0;
1993 Py_ssize_t num_surrogates;
1994
1995 if (u == NULL)
1996 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001998 /* If the Unicode data is known at construction time, we can apply
1999 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002002 if (size == 0)
2003 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Single character Unicode objects in the Latin-1 range are
2006 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002007 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return get_latin1_char((unsigned char)*u);
2009
2010 /* If not empty and not single character, copy the Unicode data
2011 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002012 if (find_maxchar_surrogates(u, u + size,
2013 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 return NULL;
2015
Victor Stinner8faf8212011-12-08 22:14:11 +01002016 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 if (!unicode)
2018 return NULL;
2019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 switch (PyUnicode_KIND(unicode)) {
2021 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002022 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2024 break;
2025 case PyUnicode_2BYTE_KIND:
2026#if Py_UNICODE_SIZE == 2
2027 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2028#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002029 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2031#endif
2032 break;
2033 case PyUnicode_4BYTE_KIND:
2034#if SIZEOF_WCHAR_T == 2
2035 /* This is the only case which has to process surrogates, thus
2036 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002037 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038#else
2039 assert(num_surrogates == 0);
2040 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2041#endif
2042 break;
2043 default:
2044 assert(0 && "Impossible state");
2045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002047 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048}
2049
Alexander Belopolsky40018472011-02-26 01:02:56 +00002050PyObject *
2051PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002052{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002053 if (size < 0) {
2054 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 return NULL;
2057 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002058 if (u != NULL)
2059 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2060 else
2061 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002062}
2063
Alexander Belopolsky40018472011-02-26 01:02:56 +00002064PyObject *
2065PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002066{
2067 size_t size = strlen(u);
2068 if (size > PY_SSIZE_T_MAX) {
2069 PyErr_SetString(PyExc_OverflowError, "input too long");
2070 return NULL;
2071 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002072 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002073}
2074
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002075PyObject *
2076_PyUnicode_FromId(_Py_Identifier *id)
2077{
2078 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002079 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2080 strlen(id->string),
2081 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002082 if (!id->object)
2083 return NULL;
2084 PyUnicode_InternInPlace(&id->object);
2085 assert(!id->next);
2086 id->next = static_strings;
2087 static_strings = id;
2088 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002089 return id->object;
2090}
2091
2092void
2093_PyUnicode_ClearStaticStrings()
2094{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002095 _Py_Identifier *tmp, *s = static_strings;
2096 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002097 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002098 tmp = s->next;
2099 s->next = NULL;
2100 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002101 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002103}
2104
Benjamin Peterson0df54292012-03-26 14:50:32 -04002105/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002106
Victor Stinnerd3f08822012-05-29 12:57:52 +02002107PyObject*
2108_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002109{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002110 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002111 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002112 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002113#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002114 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002115#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002116 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002117 }
Victor Stinner785938e2011-12-11 20:09:03 +01002118 unicode = PyUnicode_New(size, 127);
2119 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002120 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002121 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002124}
2125
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002126static Py_UCS4
2127kind_maxchar_limit(unsigned int kind)
2128{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002129 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130 case PyUnicode_1BYTE_KIND:
2131 return 0x80;
2132 case PyUnicode_2BYTE_KIND:
2133 return 0x100;
2134 case PyUnicode_4BYTE_KIND:
2135 return 0x10000;
2136 default:
2137 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002138 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002139 }
2140}
2141
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002142static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002143align_maxchar(Py_UCS4 maxchar)
2144{
2145 if (maxchar <= 127)
2146 return 127;
2147 else if (maxchar <= 255)
2148 return 255;
2149 else if (maxchar <= 65535)
2150 return 65535;
2151 else
2152 return MAX_UNICODE;
2153}
2154
Victor Stinner702c7342011-10-05 13:50:52 +02002155static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002156_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002159 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002160
Serhiy Storchaka678db842013-01-26 12:16:36 +02002161 if (size == 0)
2162 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002164 if (size == 1)
2165 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002166
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002167 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 if (!res)
2170 return NULL;
2171 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002172 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002174}
2175
Victor Stinnere57b1c02011-09-28 22:20:48 +02002176static PyObject*
2177_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178{
2179 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002180 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181
Serhiy Storchaka678db842013-01-26 12:16:36 +02002182 if (size == 0)
2183 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002185 if (size == 1)
2186 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002187
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002188 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002189 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 if (!res)
2191 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002192 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002194 else {
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2197 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002198 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 return res;
2200}
2201
Victor Stinnere57b1c02011-09-28 22:20:48 +02002202static PyObject*
2203_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204{
2205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002211 if (size == 1)
2212 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002218 if (max_char < 256)
2219 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2220 PyUnicode_1BYTE_DATA(res));
2221 else if (max_char < 0x10000)
2222 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2223 PyUnicode_2BYTE_DATA(res));
2224 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002226 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 return res;
2228}
2229
2230PyObject*
2231PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2232{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002233 if (size < 0) {
2234 PyErr_SetString(PyExc_ValueError, "size must be positive");
2235 return NULL;
2236 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002237 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002239 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002241 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245 PyErr_SetString(PyExc_SystemError, "invalid kind");
2246 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248}
2249
Victor Stinnerece58de2012-04-23 23:36:38 +02002250Py_UCS4
2251_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2252{
2253 enum PyUnicode_Kind kind;
2254 void *startptr, *endptr;
2255
2256 assert(PyUnicode_IS_READY(unicode));
2257 assert(0 <= start);
2258 assert(end <= PyUnicode_GET_LENGTH(unicode));
2259 assert(start <= end);
2260
2261 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2262 return PyUnicode_MAX_CHAR_VALUE(unicode);
2263
2264 if (start == end)
2265 return 127;
2266
Victor Stinner94d558b2012-04-27 22:26:58 +02002267 if (PyUnicode_IS_ASCII(unicode))
2268 return 127;
2269
Victor Stinnerece58de2012-04-23 23:36:38 +02002270 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002271 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002272 endptr = (char *)startptr + end * kind;
2273 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002274 switch(kind) {
2275 case PyUnicode_1BYTE_KIND:
2276 return ucs1lib_find_max_char(startptr, endptr);
2277 case PyUnicode_2BYTE_KIND:
2278 return ucs2lib_find_max_char(startptr, endptr);
2279 case PyUnicode_4BYTE_KIND:
2280 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002281 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002282 assert(0);
2283 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002284 }
2285}
2286
Victor Stinner25a4b292011-10-06 12:31:55 +02002287/* Ensure that a string uses the most efficient storage, if it is not the
2288 case: create a new string with of the right kind. Write NULL into *p_unicode
2289 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002290static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002291unicode_adjust_maxchar(PyObject **p_unicode)
2292{
2293 PyObject *unicode, *copy;
2294 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 unsigned int kind;
2297
2298 assert(p_unicode != NULL);
2299 unicode = *p_unicode;
2300 assert(PyUnicode_IS_READY(unicode));
2301 if (PyUnicode_IS_ASCII(unicode))
2302 return;
2303
2304 len = PyUnicode_GET_LENGTH(unicode);
2305 kind = PyUnicode_KIND(unicode);
2306 if (kind == PyUnicode_1BYTE_KIND) {
2307 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002308 max_char = ucs1lib_find_max_char(u, u + len);
2309 if (max_char >= 128)
2310 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002311 }
2312 else if (kind == PyUnicode_2BYTE_KIND) {
2313 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002314 max_char = ucs2lib_find_max_char(u, u + len);
2315 if (max_char >= 256)
2316 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 }
2318 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002320 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002321 max_char = ucs4lib_find_max_char(u, u + len);
2322 if (max_char >= 0x10000)
2323 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002325 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002326 if (copy != NULL)
2327 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 Py_DECREF(unicode);
2329 *p_unicode = copy;
2330}
2331
Victor Stinner034f6cf2011-09-30 02:26:44 +02002332PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002333_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002334{
Victor Stinner87af4f22011-11-21 23:03:47 +01002335 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002337
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338 if (!PyUnicode_Check(unicode)) {
2339 PyErr_BadInternalCall();
2340 return NULL;
2341 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002342 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002344
Victor Stinner87af4f22011-11-21 23:03:47 +01002345 length = PyUnicode_GET_LENGTH(unicode);
2346 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002347 if (!copy)
2348 return NULL;
2349 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2350
Victor Stinner87af4f22011-11-21 23:03:47 +01002351 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2352 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002353 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002354 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002355}
2356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357
Victor Stinnerbc603d12011-10-02 01:00:40 +02002358/* Widen Unicode objects to larger buffers. Don't write terminating null
2359 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360
2361void*
2362_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2363{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002364 Py_ssize_t len;
2365 void *result;
2366 unsigned int skind;
2367
Benjamin Petersonbac79492012-01-14 13:34:47 -05002368 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369 return NULL;
2370
2371 len = PyUnicode_GET_LENGTH(s);
2372 skind = PyUnicode_KIND(s);
2373 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002374 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 return NULL;
2376 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002377 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002378 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002379 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002380 if (!result)
2381 return PyErr_NoMemory();
2382 assert(skind == PyUnicode_1BYTE_KIND);
2383 _PyUnicode_CONVERT_BYTES(
2384 Py_UCS1, Py_UCS2,
2385 PyUnicode_1BYTE_DATA(s),
2386 PyUnicode_1BYTE_DATA(s) + len,
2387 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002389 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002390 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002391 if (!result)
2392 return PyErr_NoMemory();
2393 if (skind == PyUnicode_2BYTE_KIND) {
2394 _PyUnicode_CONVERT_BYTES(
2395 Py_UCS2, Py_UCS4,
2396 PyUnicode_2BYTE_DATA(s),
2397 PyUnicode_2BYTE_DATA(s) + len,
2398 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 else {
2401 assert(skind == PyUnicode_1BYTE_KIND);
2402 _PyUnicode_CONVERT_BYTES(
2403 Py_UCS1, Py_UCS4,
2404 PyUnicode_1BYTE_DATA(s),
2405 PyUnicode_1BYTE_DATA(s) + len,
2406 result);
2407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002409 default:
2410 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 }
Victor Stinner01698042011-10-04 00:04:26 +02002412 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return NULL;
2414}
2415
2416static Py_UCS4*
2417as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2418 int copy_null)
2419{
2420 int kind;
2421 void *data;
2422 Py_ssize_t len, targetlen;
2423 if (PyUnicode_READY(string) == -1)
2424 return NULL;
2425 kind = PyUnicode_KIND(string);
2426 data = PyUnicode_DATA(string);
2427 len = PyUnicode_GET_LENGTH(string);
2428 targetlen = len;
2429 if (copy_null)
2430 targetlen++;
2431 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002432 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 if (!target) {
2434 PyErr_NoMemory();
2435 return NULL;
2436 }
2437 }
2438 else {
2439 if (targetsize < targetlen) {
2440 PyErr_Format(PyExc_SystemError,
2441 "string is longer than the buffer");
2442 if (copy_null && 0 < targetsize)
2443 target[0] = 0;
2444 return NULL;
2445 }
2446 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (kind == PyUnicode_1BYTE_KIND) {
2448 Py_UCS1 *start = (Py_UCS1 *) data;
2449 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 else if (kind == PyUnicode_2BYTE_KIND) {
2452 Py_UCS2 *start = (Py_UCS2 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2454 }
2455 else {
2456 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 if (copy_null)
2460 target[len] = 0;
2461 return target;
2462}
2463
2464Py_UCS4*
2465PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2466 int copy_null)
2467{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002468 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 PyErr_BadInternalCall();
2470 return NULL;
2471 }
2472 return as_ucs4(string, target, targetsize, copy_null);
2473}
2474
2475Py_UCS4*
2476PyUnicode_AsUCS4Copy(PyObject *string)
2477{
2478 return as_ucs4(string, NULL, 0, 1);
2479}
2480
2481#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002482
Alexander Belopolsky40018472011-02-26 01:02:56 +00002483PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002484PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002488 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002489 PyErr_BadInternalCall();
2490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 }
2492
Martin v. Löwis790465f2008-04-05 20:41:37 +00002493 if (size == -1) {
2494 size = wcslen(w);
2495 }
2496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002501
/* Maximum number of characters needed to format a value with %lld or %p.
   A long long takes at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits, plus one character for the sign; 53/22 is used as an upper
   bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002506
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002507static int
2508unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2509 Py_ssize_t width, Py_ssize_t precision)
2510{
2511 Py_ssize_t length, fill, arglen;
2512 Py_UCS4 maxchar;
2513
2514 if (PyUnicode_READY(str) == -1)
2515 return -1;
2516
2517 length = PyUnicode_GET_LENGTH(str);
2518 if ((precision == -1 || precision >= length)
2519 && width <= length)
2520 return _PyUnicodeWriter_WriteStr(writer, str);
2521
2522 if (precision != -1)
2523 length = Py_MIN(precision, length);
2524
2525 arglen = Py_MAX(length, width);
2526 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2527 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2528 else
2529 maxchar = writer->maxchar;
2530
2531 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2532 return -1;
2533
2534 if (width > length) {
2535 fill = width - length;
2536 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2537 return -1;
2538 writer->pos += fill;
2539 }
2540
2541 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2542 str, 0, length);
2543 writer->pos += length;
2544 return 0;
2545}
2546
2547static int
2548unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2549 Py_ssize_t width, Py_ssize_t precision)
2550{
2551 /* UTF-8 */
2552 Py_ssize_t length;
2553 PyObject *unicode;
2554 int res;
2555
2556 length = strlen(str);
2557 if (precision != -1)
2558 length = Py_MIN(length, precision);
2559 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2560 if (unicode == NULL)
2561 return -1;
2562
2563 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2564 Py_DECREF(unicode);
2565 return res;
2566}
2567
Victor Stinner96865452011-03-01 23:44:09 +00002568static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002569unicode_fromformat_arg(_PyUnicodeWriter *writer,
2570 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002571{
Victor Stinnere215d962012-10-06 23:03:36 +02002572 const char *p;
2573 Py_ssize_t len;
2574 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 Py_ssize_t width;
2576 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002577 int longflag;
2578 int longlongflag;
2579 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002581
2582 p = f;
2583 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002584 zeropad = 0;
2585 if (*f == '0') {
2586 zeropad = 1;
2587 f++;
2588 }
Victor Stinner96865452011-03-01 23:44:09 +00002589
2590 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 width = -1;
2592 if (Py_ISDIGIT((unsigned)*f)) {
2593 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002594 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002595 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002597 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002599 return NULL;
2600 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002602 f++;
2603 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 }
2605 precision = -1;
2606 if (*f == '.') {
2607 f++;
2608 if (Py_ISDIGIT((unsigned)*f)) {
2609 precision = (*f - '0');
2610 f++;
2611 while (Py_ISDIGIT((unsigned)*f)) {
2612 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2613 PyErr_SetString(PyExc_ValueError,
2614 "precision too big");
2615 return NULL;
2616 }
2617 precision = (precision * 10) + (*f - '0');
2618 f++;
2619 }
2620 }
Victor Stinner96865452011-03-01 23:44:09 +00002621 if (*f == '%') {
2622 /* "%.3%s" => f points to "3" */
2623 f--;
2624 }
2625 }
2626 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002628 f--;
2629 }
Victor Stinner96865452011-03-01 23:44:09 +00002630
2631 /* Handle %ld, %lu, %lld and %llu. */
2632 longflag = 0;
2633 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002634 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002635 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002636 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002637 longflag = 1;
2638 ++f;
2639 }
Victor Stinner96865452011-03-01 23:44:09 +00002640 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longlongflag = 1;
2643 f += 2;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645 }
2646 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002647 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002648 size_tflag = 1;
2649 ++f;
2650 }
Victor Stinnere215d962012-10-06 23:03:36 +02002651
2652 if (f[1] == '\0')
2653 writer->overallocate = 0;
2654
2655 switch (*f) {
2656 case 'c':
2657 {
2658 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002659 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002660 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002661 "character argument not in range(0x110000)");
2662 return NULL;
2663 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667 }
2668
2669 case 'i':
2670 case 'd':
2671 case 'u':
2672 case 'x':
2673 {
2674 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002675 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002679 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002680 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002681 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002682 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002683 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002684 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002685 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002686 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_arg(*vargs, size_t));
2688 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, unsigned int));
2691 }
2692 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002694 }
2695 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002700 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002701 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002702 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002703 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002704 va_arg(*vargs, Py_ssize_t));
2705 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, int));
2708 }
2709 assert(len >= 0);
2710
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (precision < len)
2712 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002713
2714 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2716 return NULL;
2717
Victor Stinnere215d962012-10-06 23:03:36 +02002718 if (width > precision) {
2719 Py_UCS4 fillchar;
2720 fill = width - precision;
2721 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002722 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2723 return NULL;
2724 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002728 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2729 return NULL;
2730 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732
Victor Stinner4a587072013-11-19 12:54:53 +01002733 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'p':
2739 {
2740 char number[MAX_LONG_LONG_CHARS];
2741
2742 len = sprintf(number, "%p", va_arg(*vargs, void*));
2743 assert(len >= 0);
2744
2745 /* %p is ill-defined: ensure leading 0x. */
2746 if (number[1] == 'X')
2747 number[1] = 'x';
2748 else if (number[1] != 'x') {
2749 memmove(number + 2, number,
2750 strlen(number) + 1);
2751 number[0] = '0';
2752 number[1] = 'x';
2753 len += 2;
2754 }
2755
Victor Stinner4a587072013-11-19 12:54:53 +01002756 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002757 return NULL;
2758 break;
2759 }
2760
2761 case 's':
2762 {
2763 /* UTF-8 */
2764 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002766 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002767 break;
2768 }
2769
2770 case 'U':
2771 {
2772 PyObject *obj = va_arg(*vargs, PyObject *);
2773 assert(obj && _PyUnicode_CHECK(obj));
2774
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002776 return NULL;
2777 break;
2778 }
2779
2780 case 'V':
2781 {
2782 PyObject *obj = va_arg(*vargs, PyObject *);
2783 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002784 if (obj) {
2785 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002787 return NULL;
2788 }
2789 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 assert(str != NULL);
2791 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 }
2794 break;
2795 }
2796
2797 case 'S':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 PyObject *str;
2801 assert(obj);
2802 str = PyObject_Str(obj);
2803 if (!str)
2804 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002806 Py_DECREF(str);
2807 return NULL;
2808 }
2809 Py_DECREF(str);
2810 break;
2811 }
2812
2813 case 'R':
2814 {
2815 PyObject *obj = va_arg(*vargs, PyObject *);
2816 PyObject *repr;
2817 assert(obj);
2818 repr = PyObject_Repr(obj);
2819 if (!repr)
2820 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 Py_DECREF(repr);
2823 return NULL;
2824 }
2825 Py_DECREF(repr);
2826 break;
2827 }
2828
2829 case 'A':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *ascii;
2833 assert(obj);
2834 ascii = PyObject_ASCII(obj);
2835 if (!ascii)
2836 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 Py_DECREF(ascii);
2839 return NULL;
2840 }
2841 Py_DECREF(ascii);
2842 break;
2843 }
2844
2845 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002846 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849
2850 default:
2851 /* if we stumble upon an unknown formatting code, copy the rest
2852 of the format string to the output string. (we cannot just
2853 skip the code, since there's no way to know what's in the
2854 argument list) */
2855 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002856 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002857 return NULL;
2858 f = p+len;
2859 return f;
2860 }
2861
2862 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002863 return f;
2864}
2865
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866PyObject *
2867PyUnicode_FromFormatV(const char *format, va_list vargs)
2868{
Victor Stinnere215d962012-10-06 23:03:36 +02002869 va_list vargs2;
2870 const char *f;
2871 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinner8f674cc2013-04-17 23:02:17 +02002873 _PyUnicodeWriter_Init(&writer);
2874 writer.min_length = strlen(format) + 100;
2875 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002876
2877 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2878 Copy it to be able to pass a reference to a subfunction. */
2879 Py_VA_COPY(vargs2, vargs);
2880
2881 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002882 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002883 f = unicode_fromformat_arg(&writer, f, &vargs2);
2884 if (f == NULL)
2885 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002886 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002887 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002888 const char *p;
2889 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002890
Victor Stinnere215d962012-10-06 23:03:36 +02002891 p = f;
2892 do
2893 {
2894 if ((unsigned char)*p > 127) {
2895 PyErr_Format(PyExc_ValueError,
2896 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2897 "string, got a non-ASCII byte: 0x%02x",
2898 (unsigned char)*p);
2899 return NULL;
2900 }
2901 p++;
2902 }
2903 while (*p != '\0' && *p != '%');
2904 len = p - f;
2905
2906 if (*p == '\0')
2907 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002908
2909 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002910 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002911
2912 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002914 }
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return _PyUnicodeWriter_Finish(&writer);
2916
2917 fail:
2918 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002919 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920}
2921
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922PyObject *
2923PyUnicode_FromFormat(const char *format, ...)
2924{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 PyObject* ret;
2926 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927
2928#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002930#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002933 ret = PyUnicode_FromFormatV(format, vargs);
2934 va_end(vargs);
2935 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002938#ifdef HAVE_WCHAR_H
2939
Victor Stinner5593d8a2010-10-02 11:11:27 +00002940/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2941 convert a Unicode object to a wide character string.
2942
Victor Stinnerd88d9832011-09-06 02:00:05 +02002943 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 character) required to convert the unicode object. Ignore size argument.
2945
Victor Stinnerd88d9832011-09-06 02:00:05 +02002946 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002947 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002950unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002951 wchar_t *w,
2952 Py_ssize_t size)
2953{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 const wchar_t *wstr;
2956
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002957 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 if (wstr == NULL)
2959 return -1;
2960
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 if (size > res)
2963 size = res + 1;
2964 else
2965 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 return res;
2968 }
2969 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002971}
2972
2973Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002974PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002975 wchar_t *w,
2976 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977{
2978 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 PyErr_BadInternalCall();
2980 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002982 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983}
2984
Victor Stinner137c34c2010-09-29 10:25:54 +00002985wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002986PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002987 Py_ssize_t *size)
2988{
2989 wchar_t* buffer;
2990 Py_ssize_t buflen;
2991
2992 if (unicode == NULL) {
2993 PyErr_BadInternalCall();
2994 return NULL;
2995 }
2996
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002997 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002998 if (buflen == -1)
2999 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003000 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003001 if (buffer == NULL) {
3002 PyErr_NoMemory();
3003 return NULL;
3004 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003005 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003006 if (buflen == -1) {
3007 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003008 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003009 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003010 if (size != NULL)
3011 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003012 return buffer;
3013}
3014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003015#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003019{
Victor Stinner8faf8212011-12-08 22:14:11 +01003020 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 PyErr_SetString(PyExc_ValueError,
3022 "chr() arg not in range(0x110000)");
3023 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003025
Victor Stinner985a82a2014-01-03 12:53:47 +01003026 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003027}
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003030PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003032 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003034 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003035 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003036 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 Py_INCREF(obj);
3038 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 }
3040 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 /* For a Unicode subtype that's not a Unicode object,
3042 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003043 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003045 PyErr_Format(PyExc_TypeError,
3046 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003047 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003048 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003052PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003056 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003057 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 PyErr_BadInternalCall();
3061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003063
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003064 /* Decoding bytes objects is the most common case and should be fast */
3065 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003066 if (PyBytes_GET_SIZE(obj) == 0)
3067 _Py_RETURN_UNICODE_EMPTY();
3068 v = PyUnicode_Decode(
3069 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3070 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003071 return v;
3072 }
3073
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003074 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 PyErr_SetString(PyExc_TypeError,
3076 "decoding str is not supported");
3077 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003078 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3081 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3082 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003083 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003084 Py_TYPE(obj)->tp_name);
3085 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003086 }
Tim Petersced69f82003-09-16 20:30:58 +00003087
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003088 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003089 PyBuffer_Release(&buffer);
3090 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003092
Serhiy Storchaka05997252013-01-26 12:14:02 +02003093 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003094 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003095 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096}
3097
/* Normalize an encoding name: C implementation of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied to `lower` in lower case.
   Every run of other characters becomes a single '_', except that a run at
   the very start of the name or at its end is dropped.  The result is
   null-terminated.

   Return 1 on success, or 0 if the normalized name does not fit, i.e. it is
   longer than lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminating null. */
    char *const dst_last = &lower[lower_len - 1];
    /* Set while a separator run has been seen but not yet written. */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the delayed '_' only between two kept characters. */
        if (pending_sep && dst != lower) {
            if (dst == dst_last) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_last) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 Py_ssize_t size,
3150 const char *encoding,
3151 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003152{
3153 PyObject *buffer = NULL, *unicode;
3154 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003155 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3156
3157 if (encoding == NULL) {
3158 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3159 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003160
Fred Drakee4315f52000-05-09 19:53:39 +00003161 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003162 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3163 char *lower = buflower;
3164
3165 /* Fast paths */
3166 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3167 lower += 3;
3168 if (*lower == '_') {
3169 /* Match "utf8" and "utf_8" */
3170 lower++;
3171 }
3172
3173 if (lower[0] == '8' && lower[1] == 0) {
3174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175 }
3176 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3177 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3178 }
3179 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3180 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3181 }
3182 }
3183 else {
3184 if (strcmp(lower, "ascii") == 0
3185 || strcmp(lower, "us_ascii") == 0) {
3186 return PyUnicode_DecodeASCII(s, size, errors);
3187 }
Steve Dowercc16be82016-09-08 10:35:16 -07003188 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003189 else if (strcmp(lower, "mbcs") == 0) {
3190 return PyUnicode_DecodeMBCS(s, size, errors);
3191 }
3192 #endif
3193 else if (strcmp(lower, "latin1") == 0
3194 || strcmp(lower, "latin_1") == 0
3195 || strcmp(lower, "iso_8859_1") == 0
3196 || strcmp(lower, "iso8859_1") == 0) {
3197 return PyUnicode_DecodeLatin1(s, size, errors);
3198 }
3199 }
Victor Stinner37296e82010-06-10 13:36:23 +00003200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201
3202 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003203 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003204 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003205 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003206 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (buffer == NULL)
3208 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003209 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 if (unicode == NULL)
3211 goto onError;
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003214 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3215 "use codecs.decode() to decode to arbitrary types",
3216 encoding,
3217 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 Py_DECREF(unicode);
3219 goto onError;
3220 }
3221 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003222 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003223
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 Py_XDECREF(buffer);
3226 return NULL;
3227}
3228
Alexander Belopolsky40018472011-02-26 01:02:56 +00003229PyObject *
3230PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003231 const char *encoding,
3232 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003233{
3234 PyObject *v;
3235
3236 if (!PyUnicode_Check(unicode)) {
3237 PyErr_BadArgument();
3238 goto onError;
3239 }
3240
3241 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003243
3244 /* Decode via the codec registry */
3245 v = PyCodec_Decode(unicode, encoding, errors);
3246 if (v == NULL)
3247 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003248 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251 return NULL;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 const char *encoding,
3257 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003258{
3259 PyObject *v;
3260
3261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 goto onError;
3264 }
3265
3266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003268
3269 /* Decode via the codec registry */
3270 v = PyCodec_Decode(unicode, encoding, errors);
3271 if (v == NULL)
3272 goto onError;
3273 if (!PyUnicode_Check(v)) {
3274 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003275 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3276 "use codecs.decode() to decode to arbitrary types",
3277 encoding,
3278 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279 Py_DECREF(v);
3280 goto onError;
3281 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003282 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003283
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003285 return NULL;
3286}
3287
Alexander Belopolsky40018472011-02-26 01:02:56 +00003288PyObject *
3289PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003290 Py_ssize_t size,
3291 const char *encoding,
3292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293{
3294 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003295
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 unicode = PyUnicode_FromUnicode(s, size);
3297 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3300 Py_DECREF(unicode);
3301 return v;
3302}
3303
Alexander Belopolsky40018472011-02-26 01:02:56 +00003304PyObject *
3305PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003306 const char *encoding,
3307 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003308{
3309 PyObject *v;
3310
3311 if (!PyUnicode_Check(unicode)) {
3312 PyErr_BadArgument();
3313 goto onError;
3314 }
3315
3316 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003318
3319 /* Encode via the codec registry */
3320 v = PyCodec_Encode(unicode, encoding, errors);
3321 if (v == NULL)
3322 goto onError;
3323 return v;
3324
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003326 return NULL;
3327}
3328
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).  Returns the index, in
   wchar_t units, of the first piece that fails, or 0 if every piece
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *piece = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1) {
            return piece - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3375
Victor Stinner1b579672011-12-17 05:47:23 +01003376static int
3377locale_error_handler(const char *errors, int *surrogateescape)
3378{
Victor Stinner50149202015-09-22 00:26:54 +02003379 _Py_error_handler error_handler = get_error_handler(errors);
3380 switch (error_handler)
3381 {
3382 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003383 *surrogateescape = 0;
3384 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003385 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003386 *surrogateescape = 1;
3387 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003388 default:
3389 PyErr_Format(PyExc_ValueError,
3390 "only 'strict' and 'surrogateescape' error handlers "
3391 "are supported, not '%s'",
3392 errors);
3393 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003394 }
Victor Stinner1b579672011-12-17 05:47:23 +01003395}
3396
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003397PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003398PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003399{
3400 Py_ssize_t wlen, wlen2;
3401 wchar_t *wstr;
3402 PyObject *bytes = NULL;
3403 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003404 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003405 PyObject *exc;
3406 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003407 int surrogateescape;
3408
3409 if (locale_error_handler(errors, &surrogateescape) < 0)
3410 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003411
3412 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3413 if (wstr == NULL)
3414 return NULL;
3415
3416 wlen2 = wcslen(wstr);
3417 if (wlen2 != wlen) {
3418 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003419 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003420 return NULL;
3421 }
3422
3423 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003424 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 char *str;
3426
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003427 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428 if (str == NULL) {
3429 if (error_pos == (size_t)-1) {
3430 PyErr_NoMemory();
3431 PyMem_Free(wstr);
3432 return NULL;
3433 }
3434 else {
3435 goto encode_error;
3436 }
3437 }
3438 PyMem_Free(wstr);
3439
3440 bytes = PyBytes_FromString(str);
3441 PyMem_Free(str);
3442 }
3443 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003444 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445 size_t len, len2;
3446
3447 len = wcstombs(NULL, wstr, 0);
3448 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003449 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 goto encode_error;
3451 }
3452
3453 bytes = PyBytes_FromStringAndSize(NULL, len);
3454 if (bytes == NULL) {
3455 PyMem_Free(wstr);
3456 return NULL;
3457 }
3458
3459 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3460 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003461 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 goto encode_error;
3463 }
3464 PyMem_Free(wstr);
3465 }
3466 return bytes;
3467
3468encode_error:
3469 errmsg = strerror(errno);
3470 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003471
3472 if (error_pos == (size_t)-1)
3473 error_pos = wcstombs_errorpos(wstr);
3474
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003475 PyMem_Free(wstr);
3476 Py_XDECREF(bytes);
3477
Victor Stinner2f197072011-12-17 07:08:30 +01003478 if (errmsg != NULL) {
3479 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003480 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003481 if (wstr != NULL) {
3482 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003483 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003484 } else
3485 errmsg = NULL;
3486 }
3487 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003488 reason = PyUnicode_FromString(
3489 "wcstombs() encountered an unencodable "
3490 "wide character");
3491 if (reason == NULL)
3492 return NULL;
3493
3494 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3495 "locale", unicode,
3496 (Py_ssize_t)error_pos,
3497 (Py_ssize_t)(error_pos+1),
3498 reason);
3499 Py_DECREF(reason);
3500 if (exc != NULL) {
3501 PyCodec_StrictErrors(exc);
3502 Py_XDECREF(exc);
3503 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003504 return NULL;
3505}
3506
Victor Stinnerad158722010-10-27 00:25:46 +00003507PyObject *
3508PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003509{
Steve Dowercc16be82016-09-08 10:35:16 -07003510#if defined(__APPLE__)
3511 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003512#else
Victor Stinner793b5312011-04-27 00:24:21 +02003513 PyInterpreterState *interp = PyThreadState_GET()->interp;
3514 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3515 cannot use it to encode and decode filenames before it is loaded. Load
3516 the Python codec requires to encode at least its own filename. Use the C
3517 version of the locale codec until the codec registry is initialized and
3518 the Python codec is loaded.
3519
3520 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3521 cannot only rely on it: check also interp->fscodec_initialized for
3522 subinterpreters. */
3523 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003524 return PyUnicode_AsEncodedString(unicode,
3525 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003526 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003527 }
3528 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003529 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003530 }
Victor Stinnerad158722010-10-27 00:25:46 +00003531#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003532}
3533
Alexander Belopolsky40018472011-02-26 01:02:56 +00003534PyObject *
3535PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003536 const char *encoding,
3537 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538{
3539 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003540 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003541
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 if (!PyUnicode_Check(unicode)) {
3543 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 }
Fred Drakee4315f52000-05-09 19:53:39 +00003546
Victor Stinner942889a2016-09-05 15:40:10 -07003547 if (encoding == NULL) {
3548 return _PyUnicode_AsUTF8String(unicode, errors);
3549 }
3550
Fred Drakee4315f52000-05-09 19:53:39 +00003551 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003552 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3553 char *lower = buflower;
3554
3555 /* Fast paths */
3556 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3557 lower += 3;
3558 if (*lower == '_') {
3559 /* Match "utf8" and "utf_8" */
3560 lower++;
3561 }
3562
3563 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003564 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003565 }
3566 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3567 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3568 }
3569 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3570 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3571 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003572 }
Victor Stinner942889a2016-09-05 15:40:10 -07003573 else {
3574 if (strcmp(lower, "ascii") == 0
3575 || strcmp(lower, "us_ascii") == 0) {
3576 return _PyUnicode_AsASCIIString(unicode, errors);
3577 }
Steve Dowercc16be82016-09-08 10:35:16 -07003578#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003579 else if (strcmp(lower, "mbcs") == 0) {
3580 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3581 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003582#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003583 else if (strcmp(lower, "latin1") == 0 ||
3584 strcmp(lower, "latin_1") == 0 ||
3585 strcmp(lower, "iso_8859_1") == 0 ||
3586 strcmp(lower, "iso8859_1") == 0) {
3587 return _PyUnicode_AsLatin1String(unicode, errors);
3588 }
3589 }
Victor Stinner37296e82010-06-10 13:36:23 +00003590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591
3592 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003593 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003595 return NULL;
3596
3597 /* The normal path */
3598 if (PyBytes_Check(v))
3599 return v;
3600
3601 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003602 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003603 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003604 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003605
3606 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003607 "encoder %s returned bytearray instead of bytes; "
3608 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003609 encoding);
3610 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003611 Py_DECREF(v);
3612 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003613 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003614
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003615 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3616 Py_DECREF(v);
3617 return b;
3618 }
3619
3620 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003621 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3622 "use codecs.encode() to encode to arbitrary types",
3623 encoding,
3624 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003626 return NULL;
3627}
3628
Alexander Belopolsky40018472011-02-26 01:02:56 +00003629PyObject *
3630PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003631 const char *encoding,
3632 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003633{
3634 PyObject *v;
3635
3636 if (!PyUnicode_Check(unicode)) {
3637 PyErr_BadArgument();
3638 goto onError;
3639 }
3640
3641 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003643
3644 /* Encode via the codec registry */
3645 v = PyCodec_Encode(unicode, encoding, errors);
3646 if (v == NULL)
3647 goto onError;
3648 if (!PyUnicode_Check(v)) {
3649 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003650 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3651 "use codecs.encode() to encode to arbitrary types",
3652 encoding,
3653 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003654 Py_DECREF(v);
3655 goto onError;
3656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003658
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 return NULL;
3661}
3662
/* Find the byte offset in str (len bytes long) of the first multibyte
   sequence that mbrtowc() reports as invalid or incomplete.

   Returns that offset, or 0 if no such sequence is found before the end of
   the input or an embedded null character.  Also returns 0 when the build
   does not define HAVE_MBRTOWC. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len) {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0) {
            /* Reached end of string */
            break;
        }
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(str - start);
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3693
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003694PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003696 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003697{
3698 wchar_t smallbuf[256];
3699 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3700 wchar_t *wstr;
3701 size_t wlen, wlen2;
3702 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003703 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003704 size_t error_pos;
3705 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003706 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3707 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003708
3709 if (locale_error_handler(errors, &surrogateescape) < 0)
3710 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003711
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003712 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3713 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003714 return NULL;
3715 }
3716
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003717 if (surrogateescape) {
3718 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003719 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720 if (wstr == NULL) {
3721 if (wlen == (size_t)-1)
3722 PyErr_NoMemory();
3723 else
3724 PyErr_SetFromErrno(PyExc_OSError);
3725 return NULL;
3726 }
3727
3728 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003729 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003730 }
3731 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003732 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003733#ifndef HAVE_BROKEN_MBSTOWCS
3734 wlen = mbstowcs(NULL, str, 0);
3735#else
3736 wlen = len;
3737#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003738 if (wlen == (size_t)-1)
3739 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003740 if (wlen+1 <= smallbuf_len) {
3741 wstr = smallbuf;
3742 }
3743 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003744 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745 if (!wstr)
3746 return PyErr_NoMemory();
3747 }
3748
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003749 wlen2 = mbstowcs(wstr, str, wlen+1);
3750 if (wlen2 == (size_t)-1) {
3751 if (wstr != smallbuf)
3752 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003753 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003754 }
3755#ifdef HAVE_BROKEN_MBSTOWCS
3756 assert(wlen2 == wlen);
3757#endif
3758 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3759 if (wstr != smallbuf)
3760 PyMem_Free(wstr);
3761 }
3762 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003763
3764decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003765 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003766 errmsg = strerror(errno);
3767 assert(errmsg != NULL);
3768
3769 error_pos = mbstowcs_errorpos(str, len);
3770 if (errmsg != NULL) {
3771 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003772 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003773 if (wstr != NULL) {
3774 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003775 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003776 }
Victor Stinner2f197072011-12-17 07:08:30 +01003777 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003778 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003779 reason = PyUnicode_FromString(
3780 "mbstowcs() encountered an invalid multibyte sequence");
3781 if (reason == NULL)
3782 return NULL;
3783
3784 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3785 "locale", str, len,
3786 (Py_ssize_t)error_pos,
3787 (Py_ssize_t)(error_pos+1),
3788 reason);
3789 Py_DECREF(reason);
3790 if (exc != NULL) {
3791 PyCodec_StrictErrors(exc);
3792 Py_XDECREF(exc);
3793 }
3794 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003795}
3796
3797PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003798PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003799{
3800 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003801 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003802}
3803
3804
3805PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003806PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003807 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003808 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3809}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003810
Christian Heimes5894ba72007-11-04 11:43:14 +00003811PyObject*
3812PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3813{
Steve Dowercc16be82016-09-08 10:35:16 -07003814#if defined(__APPLE__)
3815 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003816#else
Victor Stinner793b5312011-04-27 00:24:21 +02003817 PyInterpreterState *interp = PyThreadState_GET()->interp;
3818 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3819 cannot use it to encode and decode filenames before it is loaded. Load
3820 the Python codec requires to encode at least its own filename. Use the C
3821 version of the locale codec until the codec registry is initialized and
3822 the Python codec is loaded.
3823
3824 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3825 cannot only rely on it: check also interp->fscodec_initialized for
3826 subinterpreters. */
3827 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dowercc16be82016-09-08 10:35:16 -07003828 PyObject *res = PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003829 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003830 Py_FileSystemDefaultEncodeErrors);
3831#ifdef MS_WINDOWS
3832 if (!res && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
3833 PyObject *exc, *val, *tb;
3834 PyErr_Fetch(&exc, &val, &tb);
3835 PyErr_Format(PyExc_RuntimeError,
3836 "filesystem path bytes were not correctly encoded with '%s'. " \
3837 "Please report this at http://bugs.python.org/issue27781",
3838 Py_FileSystemDefaultEncoding);
3839 _PyErr_ChainExceptions(exc, val, tb);
3840 }
3841#endif
3842 return res;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003843 }
3844 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003845 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003846 }
Victor Stinnerad158722010-10-27 00:25:46 +00003847#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848}
3849
Martin v. Löwis011e8422009-05-05 04:43:17 +00003850
3851int
3852PyUnicode_FSConverter(PyObject* arg, void* addr)
3853{
Brett Cannonec6ce872016-09-06 15:50:29 -07003854 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003855 PyObject *output = NULL;
3856 Py_ssize_t size;
3857 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003858 if (arg == NULL) {
3859 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003860 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003861 return 1;
3862 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003863 path = PyOS_FSPath(arg);
3864 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003865 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003866 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003867 if (PyBytes_Check(path)) {
3868 output = path;
3869 }
3870 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3871 output = PyUnicode_EncodeFSDefault(path);
3872 Py_DECREF(path);
3873 if (!output) {
3874 return 0;
3875 }
3876 assert(PyBytes_Check(output));
3877 }
3878
Victor Stinner0ea2a462010-04-30 00:22:08 +00003879 size = PyBytes_GET_SIZE(output);
3880 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003881 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003882 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003883 Py_DECREF(output);
3884 return 0;
3885 }
3886 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003887 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003888}
3889
3890
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003891int
3892PyUnicode_FSDecoder(PyObject* arg, void* addr)
3893{
Brett Cannona5711202016-09-06 19:36:01 -07003894 int is_buffer = 0;
3895 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003896 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003897 if (arg == NULL) {
3898 Py_DECREF(*(PyObject**)addr);
3899 return 1;
3900 }
Brett Cannona5711202016-09-06 19:36:01 -07003901
3902 is_buffer = PyObject_CheckBuffer(arg);
3903 if (!is_buffer) {
3904 path = PyOS_FSPath(arg);
3905 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003906 return 0;
3907 }
Brett Cannona5711202016-09-06 19:36:01 -07003908 }
3909 else {
3910 path = arg;
3911 Py_INCREF(arg);
3912 }
3913
3914 if (PyUnicode_Check(path)) {
3915 if (PyUnicode_READY(path) == -1) {
3916 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003917 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003918 }
3919 output = path;
3920 }
3921 else if (PyBytes_Check(path) || is_buffer) {
3922 PyObject *path_bytes = NULL;
3923
3924 if (!PyBytes_Check(path) &&
3925 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3926 "path should be string, bytes, or os.PathLike, not %.200s",
3927 Py_TYPE(arg)->tp_name)) {
3928 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003930 }
3931 path_bytes = PyBytes_FromObject(path);
3932 Py_DECREF(path);
3933 if (!path_bytes) {
3934 return 0;
3935 }
3936 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3937 PyBytes_GET_SIZE(path_bytes));
3938 Py_DECREF(path_bytes);
3939 if (!output) {
3940 return 0;
3941 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003942 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003943 else {
3944 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003945 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003946 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003947 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003948 return 0;
3949 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003950 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003951 Py_DECREF(output);
3952 return 0;
3953 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003955 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003956 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003957 Py_DECREF(output);
3958 return 0;
3959 }
3960 *(PyObject**)addr = output;
3961 return Py_CLEANUP_SUPPORTED;
3962}
3963
3964
Martin v. Löwis5b222132007-06-10 09:51:05 +00003965char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003967{
Christian Heimesf3863112007-11-22 07:46:41 +00003968 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003970 if (!PyUnicode_Check(unicode)) {
3971 PyErr_BadArgument();
3972 return NULL;
3973 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003974 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003975 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003977 if (PyUnicode_UTF8(unicode) == NULL) {
3978 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003979 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 if (bytes == NULL)
3981 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3983 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003984 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 Py_DECREF(bytes);
3986 return NULL;
3987 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003988 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3989 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3990 PyBytes_AS_STRING(bytes),
3991 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 Py_DECREF(bytes);
3993 }
3994
3995 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003996 *psize = PyUnicode_UTF8_LENGTH(unicode);
3997 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003998}
3999
4000char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4004}
4005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006Py_UNICODE *
4007PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 const unsigned char *one_byte;
4010#if SIZEOF_WCHAR_T == 4
4011 const Py_UCS2 *two_bytes;
4012#else
4013 const Py_UCS4 *four_bytes;
4014 const Py_UCS4 *ucs4_end;
4015 Py_ssize_t num_surrogates;
4016#endif
4017 wchar_t *w;
4018 wchar_t *wchar_end;
4019
4020 if (!PyUnicode_Check(unicode)) {
4021 PyErr_BadArgument();
4022 return NULL;
4023 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004024 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004026 assert(_PyUnicode_KIND(unicode) != 0);
4027 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004029 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4032 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 num_surrogates = 0;
4034
4035 for (; four_bytes < ucs4_end; ++four_bytes) {
4036 if (*four_bytes > 0xFFFF)
4037 ++num_surrogates;
4038 }
4039
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4041 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4042 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 PyErr_NoMemory();
4044 return NULL;
4045 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004048 w = _PyUnicode_WSTR(unicode);
4049 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4050 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4052 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004053 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004055 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4056 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 }
4058 else
4059 *w = *four_bytes;
4060
4061 if (w > wchar_end) {
4062 assert(0 && "Miscalculated string end");
4063 }
4064 }
4065 *w = 0;
4066#else
4067 /* sizeof(wchar_t) == 4 */
4068 Py_FatalError("Impossible unicode object state, wstr and str "
4069 "should share memory already.");
4070 return NULL;
4071#endif
4072 }
4073 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004074 if ((size_t)_PyUnicode_LENGTH(unicode) >
4075 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4076 PyErr_NoMemory();
4077 return NULL;
4078 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004079 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4080 (_PyUnicode_LENGTH(unicode) + 1));
4081 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 PyErr_NoMemory();
4083 return NULL;
4084 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004085 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4086 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4087 w = _PyUnicode_WSTR(unicode);
4088 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004090 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4091 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 for (; w < wchar_end; ++one_byte, ++w)
4093 *w = *one_byte;
4094 /* null-terminate the wstr */
4095 *w = 0;
4096 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004097 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004099 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 for (; w < wchar_end; ++two_bytes, ++w)
4101 *w = *two_bytes;
4102 /* null-terminate the wstr */
4103 *w = 0;
4104#else
4105 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 PyObject_FREE(_PyUnicode_WSTR(unicode));
4107 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 Py_FatalError("Impossible unicode object state, wstr "
4109 "and str should share memory already.");
4110 return NULL;
4111#endif
4112 }
4113 else {
4114 assert(0 && "This should never happen.");
4115 }
4116 }
4117 }
4118 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004119 *size = PyUnicode_WSTR_LENGTH(unicode);
4120 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004121}
4122
Alexander Belopolsky40018472011-02-26 01:02:56 +00004123Py_UNICODE *
4124PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127}
4128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129
Alexander Belopolsky40018472011-02-26 01:02:56 +00004130Py_ssize_t
4131PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
4133 if (!PyUnicode_Check(unicode)) {
4134 PyErr_BadArgument();
4135 goto onError;
4136 }
4137 return PyUnicode_GET_SIZE(unicode);
4138
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 return -1;
4141}
4142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004143Py_ssize_t
4144PyUnicode_GetLength(PyObject *unicode)
4145{
Victor Stinner07621332012-06-16 04:53:46 +02004146 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 PyErr_BadArgument();
4148 return -1;
4149 }
Victor Stinner07621332012-06-16 04:53:46 +02004150 if (PyUnicode_READY(unicode) == -1)
4151 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152 return PyUnicode_GET_LENGTH(unicode);
4153}
4154
4155Py_UCS4
4156PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4157{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004158 void *data;
4159 int kind;
4160
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004161 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4162 PyErr_BadArgument();
4163 return (Py_UCS4)-1;
4164 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004165 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004166 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 return (Py_UCS4)-1;
4168 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004169 data = PyUnicode_DATA(unicode);
4170 kind = PyUnicode_KIND(unicode);
4171 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172}
4173
4174int
4175PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4176{
4177 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004178 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 return -1;
4180 }
Victor Stinner488fa492011-12-12 00:01:39 +01004181 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004182 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004183 PyErr_SetString(PyExc_IndexError, "string index out of range");
4184 return -1;
4185 }
Victor Stinner488fa492011-12-12 00:01:39 +01004186 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004187 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004188 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4189 PyErr_SetString(PyExc_ValueError, "character out of range");
4190 return -1;
4191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4193 index, ch);
4194 return 0;
4195}
4196
/* Return the name of the default encoding, which is hard-coded. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4202
Victor Stinner554f3f02010-06-16 23:33:54 +00004203/* create or adjust a UnicodeDecodeError */
4204static void
4205make_decode_exception(PyObject **exceptionObject,
4206 const char *encoding,
4207 const char *input, Py_ssize_t length,
4208 Py_ssize_t startpos, Py_ssize_t endpos,
4209 const char *reason)
4210{
4211 if (*exceptionObject == NULL) {
4212 *exceptionObject = PyUnicodeDecodeError_Create(
4213 encoding, input, length, startpos, endpos, reason);
4214 }
4215 else {
4216 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4217 goto onError;
4218 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4219 goto onError;
4220 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4221 goto onError;
4222 }
4223 return;
4224
4225onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004226 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004227}
4228
Steve Dowercc16be82016-09-08 10:35:16 -07004229#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230/* error handling callback helper:
4231 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004232 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 and adjust various state variables.
4234 return 0 on success, -1 on error
4235*/
4236
Alexander Belopolsky40018472011-02-26 01:02:56 +00004237static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004238unicode_decode_call_errorhandler_wchar(
4239 const char *errors, PyObject **errorHandler,
4240 const char *encoding, const char *reason,
4241 const char **input, const char **inend, Py_ssize_t *startinpos,
4242 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4243 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004245 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246
4247 PyObject *restuple = NULL;
4248 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004249 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004250 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004251 Py_ssize_t requiredsize;
4252 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004253 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004254 wchar_t *repwstr;
4255 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004257 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4258 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 *errorHandler = PyCodec_LookupError(errors);
4262 if (*errorHandler == NULL)
4263 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 }
4265
Victor Stinner554f3f02010-06-16 23:33:54 +00004266 make_decode_exception(exceptionObject,
4267 encoding,
4268 *input, *inend - *input,
4269 *startinpos, *endinpos,
4270 reason);
4271 if (*exceptionObject == NULL)
4272 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273
4274 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4275 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004278 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 }
4281 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004283
4284 /* Copy back the bytes variables, which might have been modified by the
4285 callback */
4286 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4287 if (!inputobj)
4288 goto onError;
4289 if (!PyBytes_Check(inputobj)) {
4290 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4291 }
4292 *input = PyBytes_AS_STRING(inputobj);
4293 insize = PyBytes_GET_SIZE(inputobj);
4294 *inend = *input + insize;
4295 /* we can DECREF safely, as the exception has another reference,
4296 so the object won't go away. */
4297 Py_DECREF(inputobj);
4298
4299 if (newpos<0)
4300 newpos = insize+newpos;
4301 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004302 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 goto onError;
4304 }
4305
4306 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4307 if (repwstr == NULL)
4308 goto onError;
4309 /* need more space? (at least enough for what we
4310 have+the replacement+the rest of the string (starting
4311 at the new input position), so we won't have to check space
4312 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004313 requiredsize = *outpos;
4314 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4315 goto overflow;
4316 requiredsize += repwlen;
4317 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4318 goto overflow;
4319 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004321 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322 requiredsize = 2*outsize;
4323 if (unicode_resize(output, requiredsize) < 0)
4324 goto onError;
4325 }
4326 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4327 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 *endinpos = newpos;
4329 *inptr = *input + newpos;
4330
4331 /* we made it! */
4332 Py_XDECREF(restuple);
4333 return 0;
4334
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004335 overflow:
4336 PyErr_SetString(PyExc_OverflowError,
4337 "decoded result is too long for a Python string");
4338
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 onError:
4340 Py_XDECREF(restuple);
4341 return -1;
4342}
Steve Dowercc16be82016-09-08 10:35:16 -07004343#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344
4345static int
4346unicode_decode_call_errorhandler_writer(
4347 const char *errors, PyObject **errorHandler,
4348 const char *encoding, const char *reason,
4349 const char **input, const char **inend, Py_ssize_t *startinpos,
4350 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4351 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4352{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004353 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354
4355 PyObject *restuple = NULL;
4356 PyObject *repunicode = NULL;
4357 Py_ssize_t insize;
4358 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004359 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 PyObject *inputobj = NULL;
4361
4362 if (*errorHandler == NULL) {
4363 *errorHandler = PyCodec_LookupError(errors);
4364 if (*errorHandler == NULL)
4365 goto onError;
4366 }
4367
4368 make_decode_exception(exceptionObject,
4369 encoding,
4370 *input, *inend - *input,
4371 *startinpos, *endinpos,
4372 reason);
4373 if (*exceptionObject == NULL)
4374 goto onError;
4375
4376 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4377 if (restuple == NULL)
4378 goto onError;
4379 if (!PyTuple_Check(restuple)) {
4380 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4381 goto onError;
4382 }
4383 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004385
4386 /* Copy back the bytes variables, which might have been modified by the
4387 callback */
4388 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4389 if (!inputobj)
4390 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004391 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004393 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004394 *input = PyBytes_AS_STRING(inputobj);
4395 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004396 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004397 /* we can DECREF safely, as the exception has another reference,
4398 so the object won't go away. */
4399 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004403 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004404 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004406 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407
Victor Stinner8f674cc2013-04-17 23:02:17 +02004408 if (PyUnicode_READY(repunicode) < 0)
4409 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004410 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004411 if (replen > 1) {
4412 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004413 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004414 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4415 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4416 goto onError;
4417 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004419 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004422 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 Py_XDECREF(restuple);
4426 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431}
4432
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433/* --- UTF-7 Codec -------------------------------------------------------- */
4434
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435/* See RFC2152 for details. We encode conservatively and decode liberally. */
4436
4437/* Three simple macros defining base-64. */
4438
4439/* Is c a base-64 character? */
4440
#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value of the base-64 character c: 'A'-'Z' give 0-25, 'a'-'z' give 26-51,
   '0'-'9' give 52-61, '+' gives 62 and anything else gives 63.  Only
   meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low six bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in UTF-7 input stands for itself.
 * Decoding is permissive: every ASCII byte qualifies except '+', which
 * opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4467
4468/* The UTF-7 encoder treats ASCII characters differently according to
4469 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4470 * the above). See RFC2152. This array identifies these different
4471 * sets:
4472 * 0 : "Set D"
4473 * alphanumeric and '(),-./:?
4474 * 1 : "Set O"
4475 * !"#$%&*;<=>@[]^_`{|}
4476 * 2 : "whitespace"
4477 * ht nl cr sp
4478 * 3 : special (must be base64 encoded)
4479 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4480 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481
/* Read-only lookup table: UTF-7 encoder category (0 = Set D, 1 = Set O,
   2 = whitespace, 3 = special) for each ASCII code 0..127. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be free of side effects (it is evaluated several times).  The
 * flag arguments are parenthesized so that compound expressions such as
 * `a || b` are treated as a single condition. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Alexander Belopolsky40018472011-02-26 01:02:56 +00004514PyObject *
4515PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004516 Py_ssize_t size,
4517 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004519 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4520}
4521
Antoine Pitrou244651a2009-05-04 18:56:13 +00004522/* The decoder. The only state we preserve is our read position,
4523 * i.e. how many characters we have consumed. So if we end in the
4524 * middle of a shift sequence we have to back off the read position
4525 * and the output to the beginning of the sequence, otherwise we lose
4526 * all the shift state (seen bits, number of bits seen, high
4527 * surrogate). */
4528
Alexander Belopolsky40018472011-02-26 01:02:56 +00004529PyObject *
4530PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004531 Py_ssize_t size,
4532 const char *errors,
4533 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004534{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004536 Py_ssize_t startinpos;
4537 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004539 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 const char *errmsg = "";
4541 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004542 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 unsigned int base64bits = 0;
4544 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004545 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 PyObject *errorHandler = NULL;
4547 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004549 if (size == 0) {
4550 if (consumed)
4551 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004552 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004553 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004555 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004556 _PyUnicodeWriter_Init(&writer);
4557 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004558
4559 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560 e = s + size;
4561
4562 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004563 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004565 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 if (inShift) { /* in a base-64 section */
4568 if (IS_BASE64(ch)) { /* consume a base-64 character */
4569 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4570 base64bits += 6;
4571 s++;
4572 if (base64bits >= 16) {
4573 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004574 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 base64bits -= 16;
4576 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004577 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 if (surrogate) {
4579 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004580 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4581 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004582 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004583 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004585 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 }
4587 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004588 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004589 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 }
4592 }
Victor Stinner551ac952011-11-29 22:58:13 +01004593 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 /* first surrogate */
4595 surrogate = outCh;
4596 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004598 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004599 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 }
4601 }
4602 }
4603 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (base64bits > 0) { /* left-over bits */
4606 if (base64bits >= 6) {
4607 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004608 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 errmsg = "partial character in shift sequence";
4610 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 else {
4613 /* Some bits remain; they should be zero */
4614 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004615 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 errmsg = "non-zero padding bits in shift sequence";
4617 goto utf7Error;
4618 }
4619 }
4620 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004621 if (surrogate && DECODE_DIRECT(ch)) {
4622 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4623 goto onError;
4624 }
4625 surrogate = 0;
4626 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 /* '-' is absorbed; other terminating
4628 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004629 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004631 }
4632 }
4633 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 s++; /* consume '+' */
4636 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004638 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004639 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 }
4641 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004643 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004644 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004646 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
4648 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004651 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 else {
4655 startinpos = s-starts;
4656 s++;
4657 errmsg = "unexpected special character";
4658 goto utf7Error;
4659 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 errors, &errorHandler,
4665 "utf7", errmsg,
4666 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
4670
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 /* end of string */
4672
4673 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4674 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004675 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 if (surrogate ||
4677 (base64bits >= 6) ||
4678 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 errors, &errorHandler,
4682 "utf7", "unterminated shift sequence",
4683 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 goto onError;
4686 if (s < e)
4687 goto restart;
4688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690
4691 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004692 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004694 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004695 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004696 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004697 writer.kind, writer.data, shiftOutStart);
4698 Py_XDECREF(errorHandler);
4699 Py_XDECREF(exc);
4700 _PyUnicodeWriter_Dealloc(&writer);
4701 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004702 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004703 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 }
4705 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004706 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 Py_XDECREF(errorHandler);
4711 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715 Py_XDECREF(errorHandler);
4716 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004718 return NULL;
4719}
4720
4721
Alexander Belopolsky40018472011-02-26 01:02:56 +00004722PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723_PyUnicode_EncodeUTF7(PyObject *str,
4724 int base64SetO,
4725 int base64WhiteSpace,
4726 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004728 int kind;
4729 void *data;
4730 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004731 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004732 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004733 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 unsigned int base64bits = 0;
4735 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 char * out;
4737 char * start;
4738
Benjamin Petersonbac79492012-01-14 13:34:47 -05004739 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 return NULL;
4741 kind = PyUnicode_KIND(str);
4742 data = PyUnicode_DATA(str);
4743 len = PyUnicode_GET_LENGTH(str);
4744
4745 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004749 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004750 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004751 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 if (v == NULL)
4753 return NULL;
4754
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004755 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004756 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004757 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759 if (inShift) {
4760 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4761 /* shifting out */
4762 if (base64bits) { /* output remaining bits */
4763 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4764 base64buffer = 0;
4765 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766 }
4767 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 /* Characters not in the BASE64 set implicitly unshift the sequence
4769 so no '-' is required, except if the character is itself a '-' */
4770 if (IS_BASE64(ch) || ch == '-') {
4771 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004773 *out++ = (char) ch;
4774 }
4775 else {
4776 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004777 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 else { /* not in a shift sequence */
4780 if (ch == '+') {
4781 *out++ = '+';
4782 *out++ = '-';
4783 }
4784 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4785 *out++ = (char) ch;
4786 }
4787 else {
4788 *out++ = '+';
4789 inShift = 1;
4790 goto encode_char;
4791 }
4792 }
4793 continue;
4794encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004796 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004797
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 /* code first surrogate */
4799 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004800 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 while (base64bits >= 6) {
4802 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4803 base64bits -= 6;
4804 }
4805 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004806 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 base64bits += 16;
4809 base64buffer = (base64buffer << 16) | ch;
4810 while (base64bits >= 6) {
4811 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4812 base64bits -= 6;
4813 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004814 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 if (base64bits)
4816 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4817 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004818 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004819 if (_PyBytes_Resize(&v, out - start) < 0)
4820 return NULL;
4821 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004822}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004823PyObject *
4824PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4825 Py_ssize_t size,
4826 int base64SetO,
4827 int base64WhiteSpace,
4828 const char *errors)
4829{
4830 PyObject *result;
4831 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4832 if (tmp == NULL)
4833 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004834 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004835 base64WhiteSpace, errors);
4836 Py_DECREF(tmp);
4837 return result;
4838}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004839
Antoine Pitrou244651a2009-05-04 18:56:13 +00004840#undef IS_BASE64
4841#undef FROM_BASE64
4842#undef TO_BASE64
4843#undef DECODE_DIRECT
4844#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846/* --- UTF-8 Codec -------------------------------------------------------- */
4847
Alexander Belopolsky40018472011-02-26 01:02:56 +00004848PyObject *
4849PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004850 Py_ssize_t size,
4851 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
Walter Dörwald69652032004-09-07 20:24:22 +00004853 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4854}
4855
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856#include "stringlib/asciilib.h"
4857#include "stringlib/codecs.h"
4858#include "stringlib/undef.h"
4859
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004860#include "stringlib/ucs1lib.h"
4861#include "stringlib/codecs.h"
4862#include "stringlib/undef.h"
4863
4864#include "stringlib/ucs2lib.h"
4865#include "stringlib/codecs.h"
4866#include "stringlib/undef.h"
4867
4868#include "stringlib/ucs4lib.h"
4869#include "stringlib/codecs.h"
4870#include "stringlib/undef.h"
4871
Antoine Pitrouab868312009-01-10 15:40:25 +00004872/* Mask to quickly check whether a C 'long' contains a
4873 non-ASCII, UTF8-encoded char. */
4874#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004875# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004876#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004877# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004878#else
4879# error C 'long' size should be either 4 or 8!
4880#endif
4881
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882static Py_ssize_t
4883ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004886 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004888 /*
4889 * Issue #17237: m68k is a bit different from most architectures in
4890 * that objects do not use "natural alignment" - for example, int and
4891 * long are only aligned at 2-byte boundaries. Therefore the assert()
4892 * won't work; also, tests have shown that skipping the "optimised
4893 * version" will even speed up m68k.
4894 */
4895#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004897 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4898 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 /* Fast path, see in STRINGLIB(utf8_decode) for
4900 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004901 /* Help allocation */
4902 const char *_p = p;
4903 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904 while (_p < aligned_end) {
4905 unsigned long value = *(const unsigned long *) _p;
4906 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 *((unsigned long *)q) = value;
4909 _p += SIZEOF_LONG;
4910 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004911 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 p = _p;
4913 while (p < end) {
4914 if ((unsigned char)*p & 0x80)
4915 break;
4916 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004921#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 while (p < end) {
4923 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4924 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004925 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004926 /* Help allocation */
4927 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 while (_p < aligned_end) {
4929 unsigned long value = *(unsigned long *) _p;
4930 if (value & ASCII_CHAR_MASK)
4931 break;
4932 _p += SIZEOF_LONG;
4933 }
4934 p = _p;
4935 if (_p == end)
4936 break;
4937 }
4938 if ((unsigned char)*p & 0x80)
4939 break;
4940 ++p;
4941 }
4942 memcpy(dest, start, p - start);
4943 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944}
Antoine Pitrouab868312009-01-10 15:40:25 +00004945
Victor Stinner785938e2011-12-11 20:09:03 +01004946PyObject *
4947PyUnicode_DecodeUTF8Stateful(const char *s,
4948 Py_ssize_t size,
4949 const char *errors,
4950 Py_ssize_t *consumed)
4951{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004952 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004953 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955
4956 Py_ssize_t startinpos;
4957 Py_ssize_t endinpos;
4958 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004959 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004961 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004962
4963 if (size == 0) {
4964 if (consumed)
4965 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004966 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004967 }
4968
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004969 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4970 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004971 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 *consumed = 1;
4973 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004974 }
4975
Victor Stinner8f674cc2013-04-17 23:02:17 +02004976 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004977 writer.min_length = size;
4978 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004979 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004980
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 writer.pos = ascii_decode(s, end, writer.data);
4982 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983 while (s < end) {
4984 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004986
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 if (PyUnicode_IS_ASCII(writer.buffer))
4989 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004992 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 } else {
4995 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 }
4998
4999 switch (ch) {
5000 case 0:
5001 if (s == end || consumed)
5002 goto End;
5003 errmsg = "unexpected end of data";
5004 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005005 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 break;
5007 case 1:
5008 errmsg = "invalid start byte";
5009 startinpos = s - starts;
5010 endinpos = startinpos + 1;
5011 break;
5012 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005013 case 3:
5014 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 errmsg = "invalid continuation byte";
5016 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005017 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 break;
5019 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005020 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 goto onError;
5022 continue;
5023 }
5024
Victor Stinner1d65d912015-10-05 13:43:50 +02005025 if (error_handler == _Py_ERROR_UNKNOWN)
5026 error_handler = get_error_handler(errors);
5027
5028 switch (error_handler) {
5029 case _Py_ERROR_IGNORE:
5030 s += (endinpos - startinpos);
5031 break;
5032
5033 case _Py_ERROR_REPLACE:
5034 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5035 goto onError;
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005040 {
5041 Py_ssize_t i;
5042
Victor Stinner1d65d912015-10-05 13:43:50 +02005043 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5044 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005045 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005046 ch = (Py_UCS4)(unsigned char)(starts[i]);
5047 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5048 ch + 0xdc00);
5049 writer.pos++;
5050 }
5051 s += (endinpos - startinpos);
5052 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005053 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005054
5055 default:
5056 if (unicode_decode_call_errorhandler_writer(
5057 errors, &error_handler_obj,
5058 "utf-8", errmsg,
5059 &starts, &end, &startinpos, &endinpos, &exc, &s,
5060 &writer))
5061 goto onError;
5062 }
Victor Stinner785938e2011-12-11 20:09:03 +01005063 }
5064
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005065End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005066 if (consumed)
5067 *consumed = s - starts;
5068
Victor Stinner1d65d912015-10-05 13:43:50 +02005069 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072
5073onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005074 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005076 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005078}
5079
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080#ifdef __APPLE__
5081
5082/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005083 used to decode the command line arguments on Mac OS X.
5084
5085 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005086 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005087
5088wchar_t*
5089_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5090{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005091 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 wchar_t *unicode;
5093 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005094
5095 /* Note: size will always be longer than the resulting Unicode
5096 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005097 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005099 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005100 if (!unicode)
5101 return NULL;
5102
5103 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005105 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005106 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005108#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005110#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005111 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 if (ch > 0xFF) {
5114#if SIZEOF_WCHAR_T == 4
5115 assert(0);
5116#else
5117 assert(Py_UNICODE_IS_SURROGATE(ch));
5118 /* compute and append the two surrogates: */
5119 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5120 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5121#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005122 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005123 else {
5124 if (!ch && s == e)
5125 break;
5126 /* surrogateescape */
5127 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5128 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005131 return unicode;
5132}
5133
5134#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136/* Primary internal function which creates utf8 encoded bytes objects.
5137
5138 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005139 and allocate exactly as much space needed at the end. Else allocate the
5140 maximum possible needed (4 result bytes per Unicode character), and return
5141 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005142*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005143PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005144_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145{
Victor Stinner6099a032011-12-18 14:22:26 +01005146 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005147 void *data;
5148 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150 if (!PyUnicode_Check(unicode)) {
5151 PyErr_BadArgument();
5152 return NULL;
5153 }
5154
5155 if (PyUnicode_READY(unicode) == -1)
5156 return NULL;
5157
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005158 if (PyUnicode_UTF8(unicode))
5159 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5160 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161
5162 kind = PyUnicode_KIND(unicode);
5163 data = PyUnicode_DATA(unicode);
5164 size = PyUnicode_GET_LENGTH(unicode);
5165
Benjamin Petersonead6b532011-12-20 17:23:42 -06005166 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005167 default:
5168 assert(0);
5169 case PyUnicode_1BYTE_KIND:
5170 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5171 assert(!PyUnicode_IS_ASCII(unicode));
5172 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5173 case PyUnicode_2BYTE_KIND:
5174 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5175 case PyUnicode_4BYTE_KIND:
5176 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178}
5179
Alexander Belopolsky40018472011-02-26 01:02:56 +00005180PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005181PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5182 Py_ssize_t size,
5183 const char *errors)
5184{
5185 PyObject *v, *unicode;
5186
5187 unicode = PyUnicode_FromUnicode(s, size);
5188 if (unicode == NULL)
5189 return NULL;
5190 v = _PyUnicode_AsUTF8String(unicode, errors);
5191 Py_DECREF(unicode);
5192 return v;
5193}
5194
5195PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005196PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005198 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199}
5200
Walter Dörwald41980ca2007-08-16 21:55:45 +00005201/* --- UTF-32 Codec ------------------------------------------------------- */
5202
5203PyObject *
5204PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 Py_ssize_t size,
5206 const char *errors,
5207 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005208{
5209 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5210}
5211
5212PyObject *
5213PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 Py_ssize_t size,
5215 const char *errors,
5216 int *byteorder,
5217 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005218{
5219 const char *starts = s;
5220 Py_ssize_t startinpos;
5221 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005222 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005223 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005224 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005225 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005227 PyObject *errorHandler = NULL;
5228 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005229
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230 q = (unsigned char *)s;
5231 e = q + size;
5232
5233 if (byteorder)
5234 bo = *byteorder;
5235
5236 /* Check for BOM marks (U+FEFF) in the input and adjust current
5237 byte order setting accordingly. In native mode, the leading BOM
5238 mark is skipped, in all other modes, it is copied to the output
5239 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005240 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005241 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005242 if (bom == 0x0000FEFF) {
5243 bo = -1;
5244 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 else if (bom == 0xFFFE0000) {
5247 bo = 1;
5248 q += 4;
5249 }
5250 if (byteorder)
5251 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 }
5253
Victor Stinnere64322e2012-10-30 23:12:47 +01005254 if (q == e) {
5255 if (consumed)
5256 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005257 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 }
5259
Victor Stinnere64322e2012-10-30 23:12:47 +01005260#ifdef WORDS_BIGENDIAN
5261 le = bo < 0;
5262#else
5263 le = bo <= 0;
5264#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005265 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005266
Victor Stinner8f674cc2013-04-17 23:02:17 +02005267 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005268 writer.min_length = (e - q + 3) / 4;
5269 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005270 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005271
Victor Stinnere64322e2012-10-30 23:12:47 +01005272 while (1) {
5273 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005274 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005275
Victor Stinnere64322e2012-10-30 23:12:47 +01005276 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005277 enum PyUnicode_Kind kind = writer.kind;
5278 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005280 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005281 if (le) {
5282 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005283 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005284 if (ch > maxch)
5285 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005286 if (kind != PyUnicode_1BYTE_KIND &&
5287 Py_UNICODE_IS_SURROGATE(ch))
5288 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005289 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 q += 4;
5291 } while (q <= last);
5292 }
5293 else {
5294 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005295 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 if (ch > maxch)
5297 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005298 if (kind != PyUnicode_1BYTE_KIND &&
5299 Py_UNICODE_IS_SURROGATE(ch))
5300 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005301 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 q += 4;
5303 } while (q <= last);
5304 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 }
5307
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005308 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005309 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005310 startinpos = ((const char *)q) - starts;
5311 endinpos = startinpos + 4;
5312 }
5313 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005316 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 startinpos = ((const char *)q) - starts;
5319 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 else {
5322 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005323 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 goto onError;
5325 q += 4;
5326 continue;
5327 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005328 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005329 startinpos = ((const char *)q) - starts;
5330 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005332
5333 /* The remaining input chars are ignored if the callback
5334 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005335 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005337 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005339 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005341 }
5342
Walter Dörwald41980ca2007-08-16 21:55:45 +00005343 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005345
Walter Dörwald41980ca2007-08-16 21:55:45 +00005346 Py_XDECREF(errorHandler);
5347 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005348 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005349
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352 Py_XDECREF(errorHandler);
5353 Py_XDECREF(exc);
5354 return NULL;
5355}
5356
5357PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005358_PyUnicode_EncodeUTF32(PyObject *str,
5359 const char *errors,
5360 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 enum PyUnicode_Kind kind;
5363 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005365 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005366 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005367#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005372 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005373 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005374 PyObject *errorHandler = NULL;
5375 PyObject *exc = NULL;
5376 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005378 if (!PyUnicode_Check(str)) {
5379 PyErr_BadArgument();
5380 return NULL;
5381 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005382 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383 return NULL;
5384 kind = PyUnicode_KIND(str);
5385 data = PyUnicode_DATA(str);
5386 len = PyUnicode_GET_LENGTH(str);
5387
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005389 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005391 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005392 if (v == NULL)
5393 return NULL;
5394
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 /* output buffer is 4-bytes aligned */
5396 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005397 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005399 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005402
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005403 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005404 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005406 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005407 else
5408 encoding = "utf-32";
5409
5410 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005411 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5412 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005413 }
5414
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 pos = 0;
5416 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418
5419 if (kind == PyUnicode_2BYTE_KIND) {
5420 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5421 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005422 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 else {
5424 assert(kind == PyUnicode_4BYTE_KIND);
5425 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5426 &out, native_ordering);
5427 }
5428 if (pos == len)
5429 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005430
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 rep = unicode_encode_call_errorhandler(
5432 errors, &errorHandler,
5433 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 if (!rep)
5436 goto error;
5437
5438 if (PyBytes_Check(rep)) {
5439 repsize = PyBytes_GET_SIZE(rep);
5440 if (repsize & 3) {
5441 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005442 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005443 "surrogates not allowed");
5444 goto error;
5445 }
5446 moreunits = repsize / 4;
5447 }
5448 else {
5449 assert(PyUnicode_Check(rep));
5450 if (PyUnicode_READY(rep) < 0)
5451 goto error;
5452 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5453 if (!PyUnicode_IS_ASCII(rep)) {
5454 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005455 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 "surrogates not allowed");
5457 goto error;
5458 }
5459 }
5460
5461 /* four bytes are reserved for each surrogate */
5462 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005463 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 Py_ssize_t morebytes = 4 * (moreunits - 1);
5465 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5466 /* integer overflow */
5467 PyErr_NoMemory();
5468 goto error;
5469 }
5470 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5471 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005472 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 }
5474
5475 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005476 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5477 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5481 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 }
5483
5484 Py_CLEAR(rep);
5485 }
5486
5487 /* Cut back to size actually needed. This is necessary for, for example,
5488 encoding of a string containing isolated surrogates and the 'ignore'
5489 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005490 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 if (nsize != PyBytes_GET_SIZE(v))
5492 _PyBytes_Resize(&v, nsize);
5493 Py_XDECREF(errorHandler);
5494 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005496 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 error:
5498 Py_XDECREF(rep);
5499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
5501 Py_XDECREF(v);
5502 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005503}
5504
Alexander Belopolsky40018472011-02-26 01:02:56 +00005505PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005506PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5507 Py_ssize_t size,
5508 const char *errors,
5509 int byteorder)
5510{
5511 PyObject *result;
5512 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5513 if (tmp == NULL)
5514 return NULL;
5515 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5516 Py_DECREF(tmp);
5517 return result;
5518}
5519
5520PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005521PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522{
Victor Stinnerb960b342011-11-20 19:12:52 +01005523 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524}
5525
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526/* --- UTF-16 Codec ------------------------------------------------------- */
5527
Tim Peters772747b2001-08-09 22:21:55 +00005528PyObject *
5529PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 Py_ssize_t size,
5531 const char *errors,
5532 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533{
Walter Dörwald69652032004-09-07 20:24:22 +00005534 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5535}
5536
5537PyObject *
5538PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 Py_ssize_t size,
5540 const char *errors,
5541 int *byteorder,
5542 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005545 Py_ssize_t startinpos;
5546 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005547 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005548 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005549 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005550 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005551 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552 PyObject *errorHandler = NULL;
5553 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005554 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Tim Peters772747b2001-08-09 22:21:55 +00005556 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005557 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
5559 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005560 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005562 /* Check for BOM marks (U+FEFF) in the input and adjust current
5563 byte order setting accordingly. In native mode, the leading BOM
5564 mark is skipped, in all other modes, it is copied to the output
5565 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566 if (bo == 0 && size >= 2) {
5567 const Py_UCS4 bom = (q[1] << 8) | q[0];
5568 if (bom == 0xFEFF) {
5569 q += 2;
5570 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 else if (bom == 0xFFFE) {
5573 q += 2;
5574 bo = 1;
5575 }
5576 if (byteorder)
5577 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Antoine Pitrou63065d72012-05-15 23:48:04 +02005580 if (q == e) {
5581 if (consumed)
5582 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005583 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005584 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005585
Christian Heimes743e0cd2012-10-17 23:52:17 +02005586#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005588 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005589#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005591 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005592#endif
Tim Peters772747b2001-08-09 22:21:55 +00005593
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594 /* Note: size will always be longer than the resulting Unicode
5595 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005596 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005597 writer.min_length = (e - q + 1) / 2;
5598 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005599 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 while (1) {
5602 Py_UCS4 ch = 0;
5603 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005604 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005608 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609 native_ordering);
5610 else
5611 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 native_ordering);
5614 } else if (kind == PyUnicode_2BYTE_KIND) {
5615 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005616 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617 native_ordering);
5618 } else {
5619 assert(kind == PyUnicode_4BYTE_KIND);
5620 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005623 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005624 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 switch (ch)
5627 {
5628 case 0:
5629 /* remaining byte at the end? (size should be even) */
5630 if (q == e || consumed)
5631 goto End;
5632 errmsg = "truncated data";
5633 startinpos = ((const char *)q) - starts;
5634 endinpos = ((const char *)e) - starts;
5635 break;
5636 /* The remaining input chars are ignored if the callback
5637 chooses to skip the input */
5638 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005639 q -= 2;
5640 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005641 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005643 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644 endinpos = ((const char *)e) - starts;
5645 break;
5646 case 2:
5647 errmsg = "illegal encoding";
5648 startinpos = ((const char *)q) - 2 - starts;
5649 endinpos = startinpos + 2;
5650 break;
5651 case 3:
5652 errmsg = "illegal UTF-16 surrogate";
5653 startinpos = ((const char *)q) - 4 - starts;
5654 endinpos = startinpos + 2;
5655 break;
5656 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005657 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 continue;
5660 }
5661
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005662 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005663 errors,
5664 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005666 &starts,
5667 (const char **)&e,
5668 &startinpos,
5669 &endinpos,
5670 &exc,
5671 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005672 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 }
5675
Antoine Pitrou63065d72012-05-15 23:48:04 +02005676End:
Walter Dörwald69652032004-09-07 20:24:22 +00005677 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 Py_XDECREF(errorHandler);
5681 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005682 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005685 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 return NULL;
5689}
5690
Tim Peters772747b2001-08-09 22:21:55 +00005691PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005692_PyUnicode_EncodeUTF16(PyObject *str,
5693 const char *errors,
5694 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005696 enum PyUnicode_Kind kind;
5697 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005698 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005699 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005700 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005701 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005702#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005703 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005704#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005705 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005706#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005707 const char *encoding;
5708 Py_ssize_t nsize, pos;
5709 PyObject *errorHandler = NULL;
5710 PyObject *exc = NULL;
5711 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005712
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 if (!PyUnicode_Check(str)) {
5714 PyErr_BadArgument();
5715 return NULL;
5716 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005717 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005718 return NULL;
5719 kind = PyUnicode_KIND(str);
5720 data = PyUnicode_DATA(str);
5721 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005722
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005724 if (kind == PyUnicode_4BYTE_KIND) {
5725 const Py_UCS4 *in = (const Py_UCS4 *)data;
5726 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005727 while (in < end) {
5728 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005730 }
5731 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005732 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005735 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 nsize = len + pairs + (byteorder == 0);
5737 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005738 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005742 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005743 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005744 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005746 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
5748 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005749 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 }
Tim Peters772747b2001-08-09 22:21:55 +00005751
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005752 if (kind == PyUnicode_1BYTE_KIND) {
5753 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5754 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005755 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005756
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 }
5760 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 }
5763 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766
5767 pos = 0;
5768 while (pos < len) {
5769 Py_ssize_t repsize, moreunits;
5770
5771 if (kind == PyUnicode_2BYTE_KIND) {
5772 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5773 &out, native_ordering);
5774 }
5775 else {
5776 assert(kind == PyUnicode_4BYTE_KIND);
5777 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5778 &out, native_ordering);
5779 }
5780 if (pos == len)
5781 break;
5782
5783 rep = unicode_encode_call_errorhandler(
5784 errors, &errorHandler,
5785 encoding, "surrogates not allowed",
5786 str, &exc, pos, pos + 1, &pos);
5787 if (!rep)
5788 goto error;
5789
5790 if (PyBytes_Check(rep)) {
5791 repsize = PyBytes_GET_SIZE(rep);
5792 if (repsize & 1) {
5793 raise_encode_exception(&exc, encoding,
5794 str, pos - 1, pos,
5795 "surrogates not allowed");
5796 goto error;
5797 }
5798 moreunits = repsize / 2;
5799 }
5800 else {
5801 assert(PyUnicode_Check(rep));
5802 if (PyUnicode_READY(rep) < 0)
5803 goto error;
5804 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5805 if (!PyUnicode_IS_ASCII(rep)) {
5806 raise_encode_exception(&exc, encoding,
5807 str, pos - 1, pos,
5808 "surrogates not allowed");
5809 goto error;
5810 }
5811 }
5812
5813 /* two bytes are reserved for each surrogate */
5814 if (moreunits > 1) {
5815 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5816 Py_ssize_t morebytes = 2 * (moreunits - 1);
5817 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5818 /* integer overflow */
5819 PyErr_NoMemory();
5820 goto error;
5821 }
5822 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5823 goto error;
5824 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5825 }
5826
5827 if (PyBytes_Check(rep)) {
5828 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5829 out += moreunits;
5830 } else /* rep is unicode */ {
5831 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5832 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5833 &out, native_ordering);
5834 }
5835
5836 Py_CLEAR(rep);
5837 }
5838
5839 /* Cut back to size actually needed. This is necessary for, for example,
5840 encoding of a string containing isolated surrogates and the 'ignore' handler
5841 is used. */
5842 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5843 if (nsize != PyBytes_GET_SIZE(v))
5844 _PyBytes_Resize(&v, nsize);
5845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005847 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005848 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 error:
5850 Py_XDECREF(rep);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
5853 Py_XDECREF(v);
5854 return NULL;
5855#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856}
5857
Alexander Belopolsky40018472011-02-26 01:02:56 +00005858PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005859PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5860 Py_ssize_t size,
5861 const char *errors,
5862 int byteorder)
5863{
5864 PyObject *result;
5865 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5866 if (tmp == NULL)
5867 return NULL;
5868 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5869 Py_DECREF(tmp);
5870 return result;
5871}
5872
5873PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005876 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877}
5878
5879/* --- Unicode Escape Codec ----------------------------------------------- */
5880
Fredrik Lundh06d12682001-01-24 07:59:11 +00005881static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005882
Alexander Belopolsky40018472011-02-26 01:02:56 +00005883PyObject *
5884PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005885 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 PyObject *errorHandler = NULL;
5892 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005893
Victor Stinner62ec3312016-09-06 17:04:34 -07005894 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005895 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005896 }
5897 /* Escaped strings will always be longer than the resulting
5898 Unicode string, so we start with size here and then reduce the
5899 length after conversion to the true value.
5900 (but if the error callback returns a long replacement string
5901 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005902 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005903 writer.min_length = size;
5904 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5905 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005906 }
5907
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 end = s + size;
5909 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005910 unsigned char c = (unsigned char) *s++;
5911 Py_UCS4 ch;
5912 int count;
5913 Py_ssize_t startinpos;
5914 Py_ssize_t endinpos;
5915 const char *message;
5916
5917#define WRITE_ASCII_CHAR(ch) \
5918 do { \
5919 assert(ch <= 127); \
5920 assert(writer.pos < writer.size); \
5921 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5922 } while(0)
5923
5924#define WRITE_CHAR(ch) \
5925 do { \
5926 if (ch <= writer.maxchar) { \
5927 assert(writer.pos < writer.size); \
5928 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5929 } \
5930 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5931 goto onError; \
5932 } \
5933 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934
5935 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005936 if (c != '\\') {
5937 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 continue;
5939 }
5940
Victor Stinner62ec3312016-09-06 17:04:34 -07005941 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005943 if (s >= end) {
5944 message = "\\ at end of string";
5945 goto error;
5946 }
5947 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948
Victor Stinner62ec3312016-09-06 17:04:34 -07005949 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005950 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005953 case '\n': continue;
5954 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5955 case '\'': WRITE_ASCII_CHAR('\''); continue;
5956 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5957 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005958 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005959 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5960 case 't': WRITE_ASCII_CHAR('\t'); continue;
5961 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5962 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005963 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005965 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005966 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 case '0': case '1': case '2': case '3':
5970 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005972 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 ch = (ch<<3) + *s++ - '0';
5974 if (s < end && '0' <= *s && *s <= '7') {
5975 ch = (ch<<3) + *s++ - '0';
5976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 WRITE_CHAR(ch);
5979 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* hex escapes */
5982 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005985 message = "truncated \\xXX escape";
5986 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005991 message = "truncated \\uXXXX escape";
5992 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005995 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005997 message = "truncated \\UXXXXXXXX escape";
5998 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006000 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 ch <<= 4;
6002 if (c >= '0' && c <= '9') {
6003 ch += c - '0';
6004 }
6005 else if (c >= 'a' && c <= 'f') {
6006 ch += c - ('a' - 10);
6007 }
6008 else if (c >= 'A' && c <= 'F') {
6009 ch += c - ('A' - 10);
6010 }
6011 else {
6012 break;
6013 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006014 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006015 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006016 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 }
6018
6019 /* when we get here, ch is a 32-bit unicode character */
6020 if (ch > MAX_UNICODE) {
6021 message = "illegal Unicode character";
6022 goto error;
6023 }
6024
6025 WRITE_CHAR(ch);
6026 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006027
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006029 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006030 if (ucnhash_CAPI == NULL) {
6031 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006032 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6033 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 if (ucnhash_CAPI == NULL) {
6035 PyErr_SetString(
6036 PyExc_UnicodeError,
6037 "\\N escapes not supported (can't load unicodedata module)"
6038 );
6039 goto onError;
6040 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006042
6043 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006044 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 const char *start = ++s;
6046 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006048 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006050 namelen = s - start;
6051 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006052 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006053 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 ch = 0xffffffff; /* in case 'getcode' messes up */
6055 if (namelen <= INT_MAX &&
6056 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6057 &ch, 0)) {
6058 assert(ch <= MAX_UNICODE);
6059 WRITE_CHAR(ch);
6060 continue;
6061 }
6062 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 }
6064 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006065 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066
6067 default:
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 WRITE_ASCII_CHAR('\\');
6069 WRITE_CHAR(c);
6070 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006072
6073 error:
6074 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006076 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006077 errors, &errorHandler,
6078 "unicodeescape", message,
6079 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006081 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 }
6083 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6084 goto onError;
6085 }
6086
6087#undef WRITE_ASCII_CHAR
6088#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006090
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006091 Py_XDECREF(errorHandler);
6092 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006093 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006094
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006096 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 Py_XDECREF(errorHandler);
6098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return NULL;
6100}
6101
6102/* Return a Unicode-Escape string version of the Unicode object.
6103
6104 If quotes is true, the string is enclosed in u"" or u'' quotes as
6105 appropriate.
6106
6107*/
6108
Alexander Belopolsky40018472011-02-26 01:02:56 +00006109PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006110PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006113 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006115 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006116 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006117 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118
Ezio Melottie7f90372012-10-05 03:33:31 +03006119 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006120 escape.
6121
Ezio Melottie7f90372012-10-05 03:33:31 +03006122 For UCS1 strings it's '\xxx', 4 bytes per source character.
6123 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6124 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006125 */
6126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 if (!PyUnicode_Check(unicode)) {
6128 PyErr_BadArgument();
6129 return NULL;
6130 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006132 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006133 }
Victor Stinner358af132015-10-12 22:36:57 +02006134
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006135 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006136 if (len == 0) {
6137 return PyBytes_FromStringAndSize(NULL, 0);
6138 }
6139
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 kind = PyUnicode_KIND(unicode);
6141 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6143 bytes, and 1 byte characters 4. */
6144 expandsize = kind * 2 + 2;
6145 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
6146 return PyErr_NoMemory();
6147 }
6148 repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1);
6149 if (repr == NULL) {
6150 return NULL;
6151 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006155 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006156
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 /* U+0000-U+00ff range */
6158 if (ch < 0x100) {
6159 if (ch >= ' ' && ch < 127) {
6160 if (ch != '\\') {
6161 /* Copy printable US ASCII as-is */
6162 *p++ = (char) ch;
6163 }
6164 /* Escape backslashes */
6165 else {
6166 *p++ = '\\';
6167 *p++ = '\\';
6168 }
6169 }
Victor Stinner358af132015-10-12 22:36:57 +02006170
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 /* Map special whitespace to '\t', \n', '\r' */
6172 else if (ch == '\t') {
6173 *p++ = '\\';
6174 *p++ = 't';
6175 }
6176 else if (ch == '\n') {
6177 *p++ = '\\';
6178 *p++ = 'n';
6179 }
6180 else if (ch == '\r') {
6181 *p++ = '\\';
6182 *p++ = 'r';
6183 }
6184
6185 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6186 else {
6187 *p++ = '\\';
6188 *p++ = 'x';
6189 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6190 *p++ = Py_hexdigits[ch & 0x000F];
6191 }
Tim Petersced69f82003-09-16 20:30:58 +00006192 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6194 else if (ch < 0x10000) {
6195 /* U+0100-U+ffff */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 *p++ = '\\';
6197 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006198 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6199 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6200 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6201 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6204 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006205
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 /* Make sure that the first two digits are zero */
6207 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006208 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006209 *p++ = 'U';
6210 *p++ = '0';
6211 *p++ = '0';
6212 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6213 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6214 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6215 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6216 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6217 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 assert(p - PyBytes_AS_STRING(repr) > 0);
6222 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6223 return NULL;
6224 }
6225 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226}
6227
Alexander Belopolsky40018472011-02-26 01:02:56 +00006228PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6230 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006232 PyObject *result;
6233 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 }
6237
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006238 result = PyUnicode_AsUnicodeEscapeString(tmp);
6239 Py_DECREF(tmp);
6240 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241}
6242
6243/* --- Raw Unicode Escape Codec ------------------------------------------- */
6244
Alexander Belopolsky40018472011-02-26 01:02:56 +00006245PyObject *
6246PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006247 Py_ssize_t size,
6248 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006251 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253 PyObject *errorHandler = NULL;
6254 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006255
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 /* Escaped strings will always be longer than the resulting
6261 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006262 length after conversion to the true value. (But decoding error
6263 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 writer.min_length = size;
6266 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6267 goto onError;
6268 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006269
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 end = s + size;
6271 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 unsigned char c = (unsigned char) *s++;
6273 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006274 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 Py_ssize_t startinpos;
6276 Py_ssize_t endinpos;
6277 const char *message;
6278
6279#define WRITE_CHAR(ch) \
6280 do { \
6281 if (ch <= writer.maxchar) { \
6282 assert(writer.pos < writer.size); \
6283 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6284 } \
6285 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6286 goto onError; \
6287 } \
6288 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 if (c != '\\' || s >= end) {
6292 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006294 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006295
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 c = (unsigned char) *s++;
6297 if (c == 'u') {
6298 count = 4;
6299 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 else if (c == 'U') {
6302 count = 8;
6303 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304 }
6305 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 assert(writer.pos < writer.size);
6307 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6308 WRITE_CHAR(c);
6309 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006310 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 startinpos = s - starts - 2;
6312
6313 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6314 for (ch = 0; count && s < end; ++s, --count) {
6315 c = (unsigned char)*s;
6316 ch <<= 4;
6317 if (c >= '0' && c <= '9') {
6318 ch += c - '0';
6319 }
6320 else if (c >= 'a' && c <= 'f') {
6321 ch += c - ('a' - 10);
6322 }
6323 else if (c >= 'A' && c <= 'F') {
6324 ch += c - ('A' - 10);
6325 }
6326 else {
6327 break;
6328 }
6329 }
6330 if (!count) {
6331 if (ch <= MAX_UNICODE) {
6332 WRITE_CHAR(ch);
6333 continue;
6334 }
6335 message = "\\Uxxxxxxxx out of range";
6336 }
6337
6338 endinpos = s-starts;
6339 writer.min_length = end - s + writer.pos;
6340 if (unicode_decode_call_errorhandler_writer(
6341 errors, &errorHandler,
6342 "rawunicodeescape", message,
6343 &starts, &end, &startinpos, &endinpos, &exc, &s,
6344 &writer)) {
6345 goto onError;
6346 }
6347 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6348 goto onError;
6349 }
6350
6351#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 Py_XDECREF(errorHandler);
6354 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006355 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006356
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006358 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 Py_XDECREF(errorHandler);
6360 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006362
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363}
6364
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006365
Alexander Belopolsky40018472011-02-26 01:02:56 +00006366PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006367PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368{
Victor Stinner62ec3312016-09-06 17:04:34 -07006369 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006372 int kind;
6373 void *data;
6374 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376 if (!PyUnicode_Check(unicode)) {
6377 PyErr_BadArgument();
6378 return NULL;
6379 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006381 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006382 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006383 kind = PyUnicode_KIND(unicode);
6384 data = PyUnicode_DATA(unicode);
6385 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 if (kind == PyUnicode_1BYTE_KIND) {
6387 return PyBytes_FromStringAndSize(data, len);
6388 }
Victor Stinner0e368262011-11-10 20:12:49 +01006389
Victor Stinner62ec3312016-09-06 17:04:34 -07006390 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6391 bytes, and 1 byte characters 4. */
6392 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006393
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 if (len > PY_SSIZE_T_MAX / expandsize) {
6395 return PyErr_NoMemory();
6396 }
6397 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6398 if (repr == NULL) {
6399 return NULL;
6400 }
6401 if (len == 0) {
6402 return repr;
6403 }
6404
6405 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 for (pos = 0; pos < len; pos++) {
6407 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006408
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6410 if (ch < 0x100) {
6411 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006412 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6414 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 *p++ = '\\';
6416 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006417 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6418 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6419 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6420 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6423 else {
6424 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6425 *p++ = '\\';
6426 *p++ = 'U';
6427 *p++ = '0';
6428 *p++ = '0';
6429 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6430 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6431 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6432 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6433 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6434 *p++ = Py_hexdigits[ch & 15];
6435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006437
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 assert(p > PyBytes_AS_STRING(repr));
6439 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6440 return NULL;
6441 }
6442 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443}
6444
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 PyObject *result;
6450 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6451 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006452 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6454 Py_DECREF(tmp);
6455 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456}
6457
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006458/* --- Unicode Internal Codec ------------------------------------------- */
6459
Alexander Belopolsky40018472011-02-26 01:02:56 +00006460PyObject *
6461_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006462 Py_ssize_t size,
6463 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006464{
6465 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006466 Py_ssize_t startinpos;
6467 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006468 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006469 const char *end;
6470 const char *reason;
6471 PyObject *errorHandler = NULL;
6472 PyObject *exc = NULL;
6473
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006474 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006475 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006476 1))
6477 return NULL;
6478
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006479 if (size == 0)
6480 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006481
Victor Stinner8f674cc2013-04-17 23:02:17 +02006482 _PyUnicodeWriter_Init(&writer);
6483 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6484 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006486 }
6487 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006488
Victor Stinner8f674cc2013-04-17 23:02:17 +02006489 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006490 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006491 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006492 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006493 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006494 endinpos = end-starts;
6495 reason = "truncated input";
6496 goto error;
6497 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006498 /* We copy the raw representation one byte at a time because the
6499 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006500 ((char *) &uch)[0] = s[0];
6501 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006502#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006503 ((char *) &uch)[2] = s[2];
6504 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006505#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006506 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006507#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508 /* We have to sanity check the raw data, otherwise doom looms for
6509 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006510 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006511 endinpos = s - starts + Py_UNICODE_SIZE;
6512 reason = "illegal code point (> 0x10FFFF)";
6513 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006514 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006515#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006516 s += Py_UNICODE_SIZE;
6517#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006518 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006520 Py_UNICODE uch2;
6521 ((char *) &uch2)[0] = s[0];
6522 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006523 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006524 {
Victor Stinner551ac952011-11-29 22:58:13 +01006525 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006526 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006527 }
6528 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006529#endif
6530
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006531 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006532 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006533 continue;
6534
6535 error:
6536 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006537 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 errors, &errorHandler,
6539 "unicode_internal", reason,
6540 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006541 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006542 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006543 }
6544
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006545 Py_XDECREF(errorHandler);
6546 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006547 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006548
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006550 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551 Py_XDECREF(errorHandler);
6552 Py_XDECREF(exc);
6553 return NULL;
6554}
6555
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556/* --- Latin-1 Codec ------------------------------------------------------ */
6557
Alexander Belopolsky40018472011-02-26 01:02:56 +00006558PyObject *
6559PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006560 Py_ssize_t size,
6561 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006564 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565}
6566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568static void
6569make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006570 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006571 PyObject *unicode,
6572 Py_ssize_t startpos, Py_ssize_t endpos,
6573 const char *reason)
6574{
6575 if (*exceptionObject == NULL) {
6576 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006578 encoding, unicode, startpos, endpos, reason);
6579 }
6580 else {
6581 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6582 goto onError;
6583 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6584 goto onError;
6585 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6586 goto onError;
6587 return;
6588 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006589 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006590 }
6591}
6592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006594static void
6595raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006596 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006597 PyObject *unicode,
6598 Py_ssize_t startpos, Py_ssize_t endpos,
6599 const char *reason)
6600{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006601 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006602 encoding, unicode, startpos, endpos, reason);
6603 if (*exceptionObject != NULL)
6604 PyCodec_StrictErrors(*exceptionObject);
6605}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606
6607/* error handling callback helper:
6608 build arguments, call the callback and check the arguments,
6609 put the result into newpos and return the replacement string, which
6610 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611static PyObject *
6612unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 PyObject **errorHandler,
6614 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006619 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621 PyObject *restuple;
6622 PyObject *resunicode;
6623
6624 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 }
6629
Benjamin Petersonbac79492012-01-14 13:34:47 -05006630 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006631 return NULL;
6632 len = PyUnicode_GET_LENGTH(unicode);
6633
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006634 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006638
6639 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006644 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 Py_DECREF(restuple);
6646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006648 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 &resunicode, newpos)) {
6650 Py_DECREF(restuple);
6651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006653 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6654 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6655 Py_DECREF(restuple);
6656 return NULL;
6657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 *newpos = len + *newpos;
6660 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006661 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 Py_DECREF(restuple);
6663 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 Py_INCREF(resunicode);
6666 Py_DECREF(restuple);
6667 return resunicode;
6668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006672 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006673 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675 /* input state */
6676 Py_ssize_t pos=0, size;
6677 int kind;
6678 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 /* pointer into the output */
6680 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006681 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6682 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006683 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006685 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006686 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006687 /* output object */
6688 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689
Benjamin Petersonbac79492012-01-14 13:34:47 -05006690 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006691 return NULL;
6692 size = PyUnicode_GET_LENGTH(unicode);
6693 kind = PyUnicode_KIND(unicode);
6694 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 /* allocate enough for a simple encoding without
6696 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006697 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006698 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006699
6700 _PyBytesWriter_Init(&writer);
6701 str = _PyBytesWriter_Alloc(&writer, size);
6702 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006703 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006705 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006706 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006709 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006711 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006715 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006718 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006720
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006721 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006723
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006724 /* Only overallocate the buffer if it's not the last write */
6725 writer.overallocate = (collend < size);
6726
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006728 if (error_handler == _Py_ERROR_UNKNOWN)
6729 error_handler = get_error_handler(errors);
6730
6731 switch (error_handler) {
6732 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006733 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006735
6736 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006737 memset(str, '?', collend - collstart);
6738 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006739 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006740 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 break;
Victor Stinner50149202015-09-22 00:26:54 +02006743
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006744 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006745 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006746 writer.min_size -= (collend - collstart);
6747 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006748 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006749 if (str == NULL)
6750 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006751 pos = collend;
6752 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006753
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006754 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006755 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006756 writer.min_size -= (collend - collstart);
6757 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006758 unicode, collstart, collend);
6759 if (str == NULL)
6760 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 break;
Victor Stinner50149202015-09-22 00:26:54 +02006763
Victor Stinnerc3713e92015-09-29 12:32:13 +02006764 case _Py_ERROR_SURROGATEESCAPE:
6765 for (i = collstart; i < collend; ++i) {
6766 ch = PyUnicode_READ(kind, data, i);
6767 if (ch < 0xdc80 || 0xdcff < ch) {
6768 /* Not a UTF-8b surrogate */
6769 break;
6770 }
6771 *str++ = (char)(ch - 0xdc00);
6772 ++pos;
6773 }
6774 if (i >= collend)
6775 break;
6776 collstart = pos;
6777 assert(collstart != collend);
6778 /* fallback to general error handling */
6779
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006781 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6782 encoding, reason, unicode, &exc,
6783 collstart, collend, &newpos);
6784 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006786
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006787 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006788 writer.min_size -= 1;
6789
Victor Stinner6bd525b2015-10-09 13:10:05 +02006790 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006791 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006792 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006793 PyBytes_AS_STRING(rep),
6794 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006795 if (str == NULL)
6796 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006797 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006798 else {
6799 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006800
Victor Stinner6bd525b2015-10-09 13:10:05 +02006801 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006803
6804 if (PyUnicode_IS_ASCII(rep)) {
6805 /* Fast path: all characters are smaller than limit */
6806 assert(limit >= 128);
6807 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6808 str = _PyBytesWriter_WriteBytes(&writer, str,
6809 PyUnicode_DATA(rep),
6810 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006812 else {
6813 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6814
6815 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6816 if (str == NULL)
6817 goto onError;
6818
6819 /* check if there is anything unencodable in the
6820 replacement and copy it to the output */
6821 for (i = 0; repsize-->0; ++i, ++str) {
6822 ch = PyUnicode_READ_CHAR(rep, i);
6823 if (ch >= limit) {
6824 raise_encode_exception(&exc, encoding, unicode,
6825 pos, pos+1, reason);
6826 goto onError;
6827 }
6828 *str = (char)ch;
6829 }
6830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006832 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006834 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006835
6836 /* If overallocation was disabled, ensure that it was the last
6837 write. Otherwise, we missed an optimization */
6838 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006839 }
6840 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006841
Victor Stinner50149202015-09-22 00:26:54 +02006842 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006844 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006845
6846 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006848 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006849 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006850 Py_XDECREF(exc);
6851 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852}
6853
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006854/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855PyObject *
6856PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006857 Py_ssize_t size,
6858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 PyObject *result;
6861 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6862 if (unicode == NULL)
6863 return NULL;
6864 result = unicode_encode_ucs1(unicode, errors, 256);
6865 Py_DECREF(unicode);
6866 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867}
6868
Alexander Belopolsky40018472011-02-26 01:02:56 +00006869PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006870_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871{
6872 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 PyErr_BadArgument();
6874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006876 if (PyUnicode_READY(unicode) == -1)
6877 return NULL;
6878 /* Fast path: if it is a one-byte string, construct
6879 bytes object directly. */
6880 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6881 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6882 PyUnicode_GET_LENGTH(unicode));
6883 /* Non-Latin-1 characters present. Defer to above function to
6884 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006885 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006886}
6887
6888PyObject*
6889PyUnicode_AsLatin1String(PyObject *unicode)
6890{
6891 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892}
6893
6894/* --- 7-bit ASCII Codec -------------------------------------------------- */
6895
Alexander Belopolsky40018472011-02-26 01:02:56 +00006896PyObject *
6897PyUnicode_DecodeASCII(const char *s,
6898 Py_ssize_t size,
6899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006902 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006903 int kind;
6904 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006905 Py_ssize_t startinpos;
6906 Py_ssize_t endinpos;
6907 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006909 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006911 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006917 if (size == 1 && (unsigned char)s[0] < 128)
6918 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006919
Victor Stinner8f674cc2013-04-17 23:02:17 +02006920 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006921 writer.min_length = size;
6922 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006923 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006926 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006927 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006928 writer.pos = outpos;
6929 if (writer.pos == size)
6930 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006931
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006932 s += writer.pos;
6933 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006935 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006937 PyUnicode_WRITE(kind, data, writer.pos, c);
6938 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006942
6943 /* byte outsize range 0x00..0x7f: call the error handler */
6944
6945 if (error_handler == _Py_ERROR_UNKNOWN)
6946 error_handler = get_error_handler(errors);
6947
6948 switch (error_handler)
6949 {
6950 case _Py_ERROR_REPLACE:
6951 case _Py_ERROR_SURROGATEESCAPE:
6952 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006953 but we may switch to UCS2 at the first write */
6954 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6955 goto onError;
6956 kind = writer.kind;
6957 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006958
6959 if (error_handler == _Py_ERROR_REPLACE)
6960 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6961 else
6962 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6963 writer.pos++;
6964 ++s;
6965 break;
6966
6967 case _Py_ERROR_IGNORE:
6968 ++s;
6969 break;
6970
6971 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 startinpos = s-starts;
6973 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006974 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006975 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 "ascii", "ordinal not in range(128)",
6977 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006978 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006980 kind = writer.kind;
6981 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006984 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006986 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006987
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006990 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 return NULL;
6993}
6994
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006995/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006996PyObject *
6997PyUnicode_EncodeASCII(const Py_UNICODE *p,
6998 Py_ssize_t size,
6999 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007001 PyObject *result;
7002 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7003 if (unicode == NULL)
7004 return NULL;
7005 result = unicode_encode_ucs1(unicode, errors, 128);
7006 Py_DECREF(unicode);
7007 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008}
7009
Alexander Belopolsky40018472011-02-26 01:02:56 +00007010PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007011_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012{
7013 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 PyErr_BadArgument();
7015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007017 if (PyUnicode_READY(unicode) == -1)
7018 return NULL;
7019 /* Fast path: if it is an ASCII-only string, construct bytes object
7020 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007021 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007022 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7023 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007024 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007025}
7026
7027PyObject *
7028PyUnicode_AsASCIIString(PyObject *unicode)
7029{
7030 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031}
7032
Steve Dowercc16be82016-09-08 10:35:16 -07007033#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007034
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007035/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007036
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007037#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038#define NEED_RETRY
7039#endif
7040
Victor Stinner3a50e702011-10-18 21:21:00 +02007041#ifndef WC_ERR_INVALID_CHARS
7042# define WC_ERR_INVALID_CHARS 0x0080
7043#endif
7044
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007045static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007046code_page_name(UINT code_page, PyObject **obj)
7047{
7048 *obj = NULL;
7049 if (code_page == CP_ACP)
7050 return "mbcs";
7051 if (code_page == CP_UTF7)
7052 return "CP_UTF7";
7053 if (code_page == CP_UTF8)
7054 return "CP_UTF8";
7055
7056 *obj = PyBytes_FromFormat("cp%u", code_page);
7057 if (*obj == NULL)
7058 return NULL;
7059 return PyBytes_AS_STRING(*obj);
7060}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061
Victor Stinner3a50e702011-10-18 21:21:00 +02007062static DWORD
7063decode_code_page_flags(UINT code_page)
7064{
7065 if (code_page == CP_UTF7) {
7066 /* The CP_UTF7 decoder only supports flags=0 */
7067 return 0;
7068 }
7069 else
7070 return MB_ERR_INVALID_CHARS;
7071}
7072
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 * Decode a byte string from a Windows code page into unicode object in strict
7075 * mode.
7076 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007077 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7078 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007080static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007081decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007082 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 const char *in,
7084 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085{
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007087 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
7090 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 assert(insize > 0);
7092 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7093 if (outsize <= 0)
7094 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095
7096 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007098 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007099 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 if (*v == NULL)
7101 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103 }
7104 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007107 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 }
7111
7112 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7114 if (outsize <= 0)
7115 goto error;
7116 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007117
Victor Stinner3a50e702011-10-18 21:21:00 +02007118error:
7119 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7120 return -2;
7121 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007122 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123}
7124
Victor Stinner3a50e702011-10-18 21:21:00 +02007125/*
7126 * Decode a byte string from a code page into unicode object with an error
7127 * handler.
7128 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007129 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 * UnicodeDecodeError exception and returns -1 on error.
7131 */
7132static int
7133decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007134 PyObject **v,
7135 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007136 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007137{
7138 const char *startin = in;
7139 const char *endin = in + size;
7140 const DWORD flags = decode_code_page_flags(code_page);
7141 /* Ideally, we should get reason from FormatMessage. This is the Windows
7142 2000 English version of the message. */
7143 const char *reason = "No mapping for the Unicode character exists "
7144 "in the target code page.";
7145 /* each step cannot decode more than 1 character, but a character can be
7146 represented as a surrogate pair */
7147 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007148 int insize;
7149 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 PyObject *errorHandler = NULL;
7151 PyObject *exc = NULL;
7152 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007153 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 DWORD err;
7155 int ret = -1;
7156
7157 assert(size > 0);
7158
7159 encoding = code_page_name(code_page, &encoding_obj);
7160 if (encoding == NULL)
7161 return -1;
7162
Victor Stinner7d00cc12014-03-17 23:08:06 +01007163 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7165 UnicodeDecodeError. */
7166 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7167 if (exc != NULL) {
7168 PyCodec_StrictErrors(exc);
7169 Py_CLEAR(exc);
7170 }
7171 goto error;
7172 }
7173
7174 if (*v == NULL) {
7175 /* Create unicode object */
7176 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7177 PyErr_NoMemory();
7178 goto error;
7179 }
Victor Stinnerab595942011-12-17 04:59:06 +01007180 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007181 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 if (*v == NULL)
7183 goto error;
7184 startout = PyUnicode_AS_UNICODE(*v);
7185 }
7186 else {
7187 /* Extend unicode object */
7188 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7189 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7190 PyErr_NoMemory();
7191 goto error;
7192 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007193 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 goto error;
7195 startout = PyUnicode_AS_UNICODE(*v) + n;
7196 }
7197
7198 /* Decode the byte string character per character */
7199 out = startout;
7200 while (in < endin)
7201 {
7202 /* Decode a character */
7203 insize = 1;
7204 do
7205 {
7206 outsize = MultiByteToWideChar(code_page, flags,
7207 in, insize,
7208 buffer, Py_ARRAY_LENGTH(buffer));
7209 if (outsize > 0)
7210 break;
7211 err = GetLastError();
7212 if (err != ERROR_NO_UNICODE_TRANSLATION
7213 && err != ERROR_INSUFFICIENT_BUFFER)
7214 {
7215 PyErr_SetFromWindowsErr(0);
7216 goto error;
7217 }
7218 insize++;
7219 }
7220 /* 4=maximum length of a UTF-8 sequence */
7221 while (insize <= 4 && (in + insize) <= endin);
7222
7223 if (outsize <= 0) {
7224 Py_ssize_t startinpos, endinpos, outpos;
7225
Victor Stinner7d00cc12014-03-17 23:08:06 +01007226 /* last character in partial decode? */
7227 if (in + insize >= endin && !final)
7228 break;
7229
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 startinpos = in - startin;
7231 endinpos = startinpos + 1;
7232 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007233 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 errors, &errorHandler,
7235 encoding, reason,
7236 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007237 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 {
7239 goto error;
7240 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007241 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 }
7243 else {
7244 in += insize;
7245 memcpy(out, buffer, outsize * sizeof(wchar_t));
7246 out += outsize;
7247 }
7248 }
7249
7250 /* write a NUL character at the end */
7251 *out = 0;
7252
7253 /* Extend unicode object */
7254 outsize = out - startout;
7255 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007256 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007258 /* (in - startin) <= size and size is an int */
7259 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007260
7261error:
7262 Py_XDECREF(encoding_obj);
7263 Py_XDECREF(errorHandler);
7264 Py_XDECREF(exc);
7265 return ret;
7266}
7267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268static PyObject *
7269decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007270 const char *s, Py_ssize_t size,
7271 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272{
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 PyObject *v = NULL;
7274 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 if (code_page < 0) {
7277 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7278 return NULL;
7279 }
7280
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283
Victor Stinner76a31a62011-11-04 00:05:13 +01007284 do
7285 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007287 if (size > INT_MAX) {
7288 chunk_size = INT_MAX;
7289 final = 0;
7290 done = 0;
7291 }
7292 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007294 {
7295 chunk_size = (int)size;
7296 final = (consumed == NULL);
7297 done = 1;
7298 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007299
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 if (chunk_size == 0 && done) {
7301 if (v != NULL)
7302 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007303 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007304 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 converted = decode_code_page_strict(code_page, &v,
7307 s, chunk_size);
7308 if (converted == -2)
7309 converted = decode_code_page_errors(code_page, &v,
7310 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007311 errors, final);
7312 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007313
7314 if (converted < 0) {
7315 Py_XDECREF(v);
7316 return NULL;
7317 }
7318
7319 if (consumed)
7320 *consumed += converted;
7321
7322 s += converted;
7323 size -= converted;
7324 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007325
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007326 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327}
7328
Alexander Belopolsky40018472011-02-26 01:02:56 +00007329PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007330PyUnicode_DecodeCodePageStateful(int code_page,
7331 const char *s,
7332 Py_ssize_t size,
7333 const char *errors,
7334 Py_ssize_t *consumed)
7335{
7336 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7337}
7338
7339PyObject *
7340PyUnicode_DecodeMBCSStateful(const char *s,
7341 Py_ssize_t size,
7342 const char *errors,
7343 Py_ssize_t *consumed)
7344{
7345 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7346}
7347
7348PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007349PyUnicode_DecodeMBCS(const char *s,
7350 Py_ssize_t size,
7351 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007352{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7354}
7355
Victor Stinner3a50e702011-10-18 21:21:00 +02007356static DWORD
7357encode_code_page_flags(UINT code_page, const char *errors)
7358{
7359 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007360 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 }
7362 else if (code_page == CP_UTF7) {
7363 /* CP_UTF7 only supports flags=0 */
7364 return 0;
7365 }
7366 else {
7367 if (errors != NULL && strcmp(errors, "replace") == 0)
7368 return 0;
7369 else
7370 return WC_NO_BEST_FIT_CHARS;
7371 }
7372}
7373
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 * Encode a Unicode string to a Windows code page into a byte string in strict
7376 * mode.
7377 *
7378 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007379 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007381static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007382encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007383 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385{
Victor Stinner554f3f02010-06-16 23:33:54 +00007386 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 BOOL *pusedDefaultChar = &usedDefaultChar;
7388 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007389 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007390 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 const DWORD flags = encode_code_page_flags(code_page, NULL);
7392 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007393 /* Create a substring so that we can get the UTF-16 representation
7394 of just the slice under consideration. */
7395 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007396
Martin v. Löwis3d325192011-11-04 18:23:06 +01007397 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007398
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007400 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007402 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007403
Victor Stinner2fc507f2011-11-04 20:06:39 +01007404 substring = PyUnicode_Substring(unicode, offset, offset+len);
7405 if (substring == NULL)
7406 return -1;
7407 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7408 if (p == NULL) {
7409 Py_DECREF(substring);
7410 return -1;
7411 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007412 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007414 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007416 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 NULL, 0,
7418 NULL, pusedDefaultChar);
7419 if (outsize <= 0)
7420 goto error;
7421 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 if (pusedDefaultChar && *pusedDefaultChar) {
7423 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007426
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 if (*outbytes == NULL) {
7431 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007433 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435 }
7436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 const Py_ssize_t n = PyBytes_Size(*outbytes);
7439 if (outsize > PY_SSIZE_T_MAX - n) {
7440 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007444 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7445 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007447 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449 }
7450
7451 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007453 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 out, outsize,
7455 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 if (outsize <= 0)
7458 goto error;
7459 if (pusedDefaultChar && *pusedDefaultChar)
7460 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007461 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007462
Victor Stinner3a50e702011-10-18 21:21:00 +02007463error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7466 return -2;
7467 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007468 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007469}
7470
Victor Stinner3a50e702011-10-18 21:21:00 +02007471/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007472 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 * error handler.
7474 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007475 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 * -1 on other error.
7477 */
7478static int
7479encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007480 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482{
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 Py_ssize_t pos = unicode_offset;
7485 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 /* Ideally, we should get reason from FormatMessage. This is the Windows
7487 2000 English version of the message. */
7488 const char *reason = "invalid character";
7489 /* 4=maximum length of a UTF-8 sequence */
7490 char buffer[4];
7491 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7492 Py_ssize_t outsize;
7493 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 PyObject *errorHandler = NULL;
7495 PyObject *exc = NULL;
7496 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007497 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 PyObject *rep;
7500 int ret = -1;
7501
7502 assert(insize > 0);
7503
7504 encoding = code_page_name(code_page, &encoding_obj);
7505 if (encoding == NULL)
7506 return -1;
7507
7508 if (errors == NULL || strcmp(errors, "strict") == 0) {
7509 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7510 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007511 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 if (exc != NULL) {
7513 PyCodec_StrictErrors(exc);
7514 Py_DECREF(exc);
7515 }
7516 Py_XDECREF(encoding_obj);
7517 return -1;
7518 }
7519
7520 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7521 pusedDefaultChar = &usedDefaultChar;
7522 else
7523 pusedDefaultChar = NULL;
7524
7525 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7526 PyErr_NoMemory();
7527 goto error;
7528 }
7529 outsize = insize * Py_ARRAY_LENGTH(buffer);
7530
7531 if (*outbytes == NULL) {
7532 /* Create string object */
7533 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7534 if (*outbytes == NULL)
7535 goto error;
7536 out = PyBytes_AS_STRING(*outbytes);
7537 }
7538 else {
7539 /* Extend string object */
7540 Py_ssize_t n = PyBytes_Size(*outbytes);
7541 if (n > PY_SSIZE_T_MAX - outsize) {
7542 PyErr_NoMemory();
7543 goto error;
7544 }
7545 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7546 goto error;
7547 out = PyBytes_AS_STRING(*outbytes) + n;
7548 }
7549
7550 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007551 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007553 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7554 wchar_t chars[2];
7555 int charsize;
7556 if (ch < 0x10000) {
7557 chars[0] = (wchar_t)ch;
7558 charsize = 1;
7559 }
7560 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007561 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7562 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007563 charsize = 2;
7564 }
7565
Victor Stinner3a50e702011-10-18 21:21:00 +02007566 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007567 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 buffer, Py_ARRAY_LENGTH(buffer),
7569 NULL, pusedDefaultChar);
7570 if (outsize > 0) {
7571 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7572 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007573 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 memcpy(out, buffer, outsize);
7575 out += outsize;
7576 continue;
7577 }
7578 }
7579 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7580 PyErr_SetFromWindowsErr(0);
7581 goto error;
7582 }
7583
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 rep = unicode_encode_call_errorhandler(
7585 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007586 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007587 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 if (rep == NULL)
7589 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007590 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007591
7592 if (PyBytes_Check(rep)) {
7593 outsize = PyBytes_GET_SIZE(rep);
7594 if (outsize != 1) {
7595 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7596 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7597 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7598 Py_DECREF(rep);
7599 goto error;
7600 }
7601 out = PyBytes_AS_STRING(*outbytes) + offset;
7602 }
7603 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7604 out += outsize;
7605 }
7606 else {
7607 Py_ssize_t i;
7608 enum PyUnicode_Kind kind;
7609 void *data;
7610
Benjamin Petersonbac79492012-01-14 13:34:47 -05007611 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 Py_DECREF(rep);
7613 goto error;
7614 }
7615
7616 outsize = PyUnicode_GET_LENGTH(rep);
7617 if (outsize != 1) {
7618 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7619 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7620 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7621 Py_DECREF(rep);
7622 goto error;
7623 }
7624 out = PyBytes_AS_STRING(*outbytes) + offset;
7625 }
7626 kind = PyUnicode_KIND(rep);
7627 data = PyUnicode_DATA(rep);
7628 for (i=0; i < outsize; i++) {
7629 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7630 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007631 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007632 encoding, unicode,
7633 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 "unable to encode error handler result to ASCII");
7635 Py_DECREF(rep);
7636 goto error;
7637 }
7638 *out = (unsigned char)ch;
7639 out++;
7640 }
7641 }
7642 Py_DECREF(rep);
7643 }
7644 /* write a NUL byte */
7645 *out = 0;
7646 outsize = out - PyBytes_AS_STRING(*outbytes);
7647 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7648 if (_PyBytes_Resize(outbytes, outsize) < 0)
7649 goto error;
7650 ret = 0;
7651
7652error:
7653 Py_XDECREF(encoding_obj);
7654 Py_XDECREF(errorHandler);
7655 Py_XDECREF(exc);
7656 return ret;
7657}
7658
Victor Stinner3a50e702011-10-18 21:21:00 +02007659static PyObject *
7660encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007661 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 const char *errors)
7663{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007664 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007665 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007666 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007667 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007668
Victor Stinner29dacf22015-01-26 16:41:32 +01007669 if (!PyUnicode_Check(unicode)) {
7670 PyErr_BadArgument();
7671 return NULL;
7672 }
7673
Benjamin Petersonbac79492012-01-14 13:34:47 -05007674 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007675 return NULL;
7676 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007677
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 if (code_page < 0) {
7679 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7680 return NULL;
7681 }
7682
Martin v. Löwis3d325192011-11-04 18:23:06 +01007683 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 return PyBytes_FromStringAndSize(NULL, 0);
7685
Victor Stinner7581cef2011-11-03 22:32:33 +01007686 offset = 0;
7687 do
7688 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007689#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007690 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 chunks. */
7692 if (len > INT_MAX/2) {
7693 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007694 done = 0;
7695 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007696 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007697#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007698 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007699 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 done = 1;
7701 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007702
Victor Stinner76a31a62011-11-04 00:05:13 +01007703 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007704 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007705 errors);
7706 if (ret == -2)
7707 ret = encode_code_page_errors(code_page, &outbytes,
7708 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007709 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007710 if (ret < 0) {
7711 Py_XDECREF(outbytes);
7712 return NULL;
7713 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007714
Victor Stinner7581cef2011-11-03 22:32:33 +01007715 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007718
Victor Stinner3a50e702011-10-18 21:21:00 +02007719 return outbytes;
7720}
7721
7722PyObject *
7723PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7724 Py_ssize_t size,
7725 const char *errors)
7726{
Victor Stinner7581cef2011-11-03 22:32:33 +01007727 PyObject *unicode, *res;
7728 unicode = PyUnicode_FromUnicode(p, size);
7729 if (unicode == NULL)
7730 return NULL;
7731 res = encode_code_page(CP_ACP, unicode, errors);
7732 Py_DECREF(unicode);
7733 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007734}
7735
7736PyObject *
7737PyUnicode_EncodeCodePage(int code_page,
7738 PyObject *unicode,
7739 const char *errors)
7740{
Victor Stinner7581cef2011-11-03 22:32:33 +01007741 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007742}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007743
Alexander Belopolsky40018472011-02-26 01:02:56 +00007744PyObject *
7745PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007746{
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007748}
7749
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750#undef NEED_RETRY
7751
Steve Dowercc16be82016-09-08 10:35:16 -07007752#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007753
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754/* --- Character Mapping Codec -------------------------------------------- */
7755
Victor Stinnerfb161b12013-04-18 01:44:27 +02007756static int
7757charmap_decode_string(const char *s,
7758 Py_ssize_t size,
7759 PyObject *mapping,
7760 const char *errors,
7761 _PyUnicodeWriter *writer)
7762{
7763 const char *starts = s;
7764 const char *e;
7765 Py_ssize_t startinpos, endinpos;
7766 PyObject *errorHandler = NULL, *exc = NULL;
7767 Py_ssize_t maplen;
7768 enum PyUnicode_Kind mapkind;
7769 void *mapdata;
7770 Py_UCS4 x;
7771 unsigned char ch;
7772
7773 if (PyUnicode_READY(mapping) == -1)
7774 return -1;
7775
7776 maplen = PyUnicode_GET_LENGTH(mapping);
7777 mapdata = PyUnicode_DATA(mapping);
7778 mapkind = PyUnicode_KIND(mapping);
7779
7780 e = s + size;
7781
7782 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7783 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7784 * is disabled in encoding aliases, latin1 is preferred because
7785 * its implementation is faster. */
7786 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7787 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7788 Py_UCS4 maxchar = writer->maxchar;
7789
7790 assert (writer->kind == PyUnicode_1BYTE_KIND);
7791 while (s < e) {
7792 ch = *s;
7793 x = mapdata_ucs1[ch];
7794 if (x > maxchar) {
7795 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7796 goto onError;
7797 maxchar = writer->maxchar;
7798 outdata = (Py_UCS1 *)writer->data;
7799 }
7800 outdata[writer->pos] = x;
7801 writer->pos++;
7802 ++s;
7803 }
7804 return 0;
7805 }
7806
7807 while (s < e) {
7808 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7809 enum PyUnicode_Kind outkind = writer->kind;
7810 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7811 if (outkind == PyUnicode_1BYTE_KIND) {
7812 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7813 Py_UCS4 maxchar = writer->maxchar;
7814 while (s < e) {
7815 ch = *s;
7816 x = mapdata_ucs2[ch];
7817 if (x > maxchar)
7818 goto Error;
7819 outdata[writer->pos] = x;
7820 writer->pos++;
7821 ++s;
7822 }
7823 break;
7824 }
7825 else if (outkind == PyUnicode_2BYTE_KIND) {
7826 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7827 while (s < e) {
7828 ch = *s;
7829 x = mapdata_ucs2[ch];
7830 if (x == 0xFFFE)
7831 goto Error;
7832 outdata[writer->pos] = x;
7833 writer->pos++;
7834 ++s;
7835 }
7836 break;
7837 }
7838 }
7839 ch = *s;
7840
7841 if (ch < maplen)
7842 x = PyUnicode_READ(mapkind, mapdata, ch);
7843 else
7844 x = 0xfffe; /* invalid value */
7845Error:
7846 if (x == 0xfffe)
7847 {
7848 /* undefined mapping */
7849 startinpos = s-starts;
7850 endinpos = startinpos+1;
7851 if (unicode_decode_call_errorhandler_writer(
7852 errors, &errorHandler,
7853 "charmap", "character maps to <undefined>",
7854 &starts, &e, &startinpos, &endinpos, &exc, &s,
7855 writer)) {
7856 goto onError;
7857 }
7858 continue;
7859 }
7860
7861 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7862 goto onError;
7863 ++s;
7864 }
7865 Py_XDECREF(errorHandler);
7866 Py_XDECREF(exc);
7867 return 0;
7868
7869onError:
7870 Py_XDECREF(errorHandler);
7871 Py_XDECREF(exc);
7872 return -1;
7873}
7874
7875static int
7876charmap_decode_mapping(const char *s,
7877 Py_ssize_t size,
7878 PyObject *mapping,
7879 const char *errors,
7880 _PyUnicodeWriter *writer)
7881{
7882 const char *starts = s;
7883 const char *e;
7884 Py_ssize_t startinpos, endinpos;
7885 PyObject *errorHandler = NULL, *exc = NULL;
7886 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007887 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007888
7889 e = s + size;
7890
7891 while (s < e) {
7892 ch = *s;
7893
7894 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7895 key = PyLong_FromLong((long)ch);
7896 if (key == NULL)
7897 goto onError;
7898
7899 item = PyObject_GetItem(mapping, key);
7900 Py_DECREF(key);
7901 if (item == NULL) {
7902 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7903 /* No mapping found means: mapping is undefined. */
7904 PyErr_Clear();
7905 goto Undefined;
7906 } else
7907 goto onError;
7908 }
7909
7910 /* Apply mapping */
7911 if (item == Py_None)
7912 goto Undefined;
7913 if (PyLong_Check(item)) {
7914 long value = PyLong_AS_LONG(item);
7915 if (value == 0xFFFE)
7916 goto Undefined;
7917 if (value < 0 || value > MAX_UNICODE) {
7918 PyErr_Format(PyExc_TypeError,
7919 "character mapping must be in range(0x%lx)",
7920 (unsigned long)MAX_UNICODE + 1);
7921 goto onError;
7922 }
7923
7924 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7925 goto onError;
7926 }
7927 else if (PyUnicode_Check(item)) {
7928 if (PyUnicode_READY(item) == -1)
7929 goto onError;
7930 if (PyUnicode_GET_LENGTH(item) == 1) {
7931 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7932 if (value == 0xFFFE)
7933 goto Undefined;
7934 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7935 goto onError;
7936 }
7937 else {
7938 writer->overallocate = 1;
7939 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7940 goto onError;
7941 }
7942 }
7943 else {
7944 /* wrong return value */
7945 PyErr_SetString(PyExc_TypeError,
7946 "character mapping must return integer, None or str");
7947 goto onError;
7948 }
7949 Py_CLEAR(item);
7950 ++s;
7951 continue;
7952
7953Undefined:
7954 /* undefined mapping */
7955 Py_CLEAR(item);
7956 startinpos = s-starts;
7957 endinpos = startinpos+1;
7958 if (unicode_decode_call_errorhandler_writer(
7959 errors, &errorHandler,
7960 "charmap", "character maps to <undefined>",
7961 &starts, &e, &startinpos, &endinpos, &exc, &s,
7962 writer)) {
7963 goto onError;
7964 }
7965 }
7966 Py_XDECREF(errorHandler);
7967 Py_XDECREF(exc);
7968 return 0;
7969
7970onError:
7971 Py_XDECREF(item);
7972 Py_XDECREF(errorHandler);
7973 Py_XDECREF(exc);
7974 return -1;
7975}
7976
Alexander Belopolsky40018472011-02-26 01:02:56 +00007977PyObject *
7978PyUnicode_DecodeCharmap(const char *s,
7979 Py_ssize_t size,
7980 PyObject *mapping,
7981 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007983 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 /* Default to Latin-1 */
7986 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007990 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007991 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007992 writer.min_length = size;
7993 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007995
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007996 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007997 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7998 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007999 }
8000 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008001 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8002 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008004 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008005
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008007 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 return NULL;
8009}
8010
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011/* Charmap encoding: the lookup table */
8012
Alexander Belopolsky40018472011-02-26 01:02:56 +00008013struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 PyObject_HEAD
8015 unsigned char level1[32];
8016 int count2, count3;
8017 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018};
8019
8020static PyObject*
8021encoding_map_size(PyObject *obj, PyObject* args)
8022{
8023 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008024 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026}
8027
8028static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008029 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 PyDoc_STR("Return the size (in bytes) of this object") },
8031 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032};
8033
8034static void
8035encoding_map_dealloc(PyObject* o)
8036{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008037 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038}
8039
8040static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008041 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 "EncodingMap", /*tp_name*/
8043 sizeof(struct encoding_map), /*tp_basicsize*/
8044 0, /*tp_itemsize*/
8045 /* methods */
8046 encoding_map_dealloc, /*tp_dealloc*/
8047 0, /*tp_print*/
8048 0, /*tp_getattr*/
8049 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008050 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 0, /*tp_repr*/
8052 0, /*tp_as_number*/
8053 0, /*tp_as_sequence*/
8054 0, /*tp_as_mapping*/
8055 0, /*tp_hash*/
8056 0, /*tp_call*/
8057 0, /*tp_str*/
8058 0, /*tp_getattro*/
8059 0, /*tp_setattro*/
8060 0, /*tp_as_buffer*/
8061 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8062 0, /*tp_doc*/
8063 0, /*tp_traverse*/
8064 0, /*tp_clear*/
8065 0, /*tp_richcompare*/
8066 0, /*tp_weaklistoffset*/
8067 0, /*tp_iter*/
8068 0, /*tp_iternext*/
8069 encoding_map_methods, /*tp_methods*/
8070 0, /*tp_members*/
8071 0, /*tp_getset*/
8072 0, /*tp_base*/
8073 0, /*tp_dict*/
8074 0, /*tp_descr_get*/
8075 0, /*tp_descr_set*/
8076 0, /*tp_dictoffset*/
8077 0, /*tp_init*/
8078 0, /*tp_alloc*/
8079 0, /*tp_new*/
8080 0, /*tp_free*/
8081 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082};
8083
8084PyObject*
8085PyUnicode_BuildEncodingMap(PyObject* string)
8086{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087 PyObject *result;
8088 struct encoding_map *mresult;
8089 int i;
8090 int need_dict = 0;
8091 unsigned char level1[32];
8092 unsigned char level2[512];
8093 unsigned char *mlevel1, *mlevel2, *mlevel3;
8094 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 int kind;
8096 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008097 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008100 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 PyErr_BadArgument();
8102 return NULL;
8103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 kind = PyUnicode_KIND(string);
8105 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008106 length = PyUnicode_GET_LENGTH(string);
8107 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 memset(level1, 0xFF, sizeof level1);
8109 memset(level2, 0xFF, sizeof level2);
8110
8111 /* If there isn't a one-to-one mapping of NULL to \0,
8112 or if there are non-BMP characters, we need to use
8113 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008116 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 ch = PyUnicode_READ(kind, data, i);
8119 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 need_dict = 1;
8121 break;
8122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 /* unmapped character */
8125 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 l1 = ch >> 11;
8127 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008128 if (level1[l1] == 0xFF)
8129 level1[l1] = count2++;
8130 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 }
8133
8134 if (count2 >= 0xFF || count3 >= 0xFF)
8135 need_dict = 1;
8136
8137 if (need_dict) {
8138 PyObject *result = PyDict_New();
8139 PyObject *key, *value;
8140 if (!result)
8141 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008142 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008144 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 if (!key || !value)
8146 goto failed1;
8147 if (PyDict_SetItem(result, key, value) == -1)
8148 goto failed1;
8149 Py_DECREF(key);
8150 Py_DECREF(value);
8151 }
8152 return result;
8153 failed1:
8154 Py_XDECREF(key);
8155 Py_XDECREF(value);
8156 Py_DECREF(result);
8157 return NULL;
8158 }
8159
8160 /* Create a three-level trie */
8161 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8162 16*count2 + 128*count3 - 1);
8163 if (!result)
8164 return PyErr_NoMemory();
8165 PyObject_Init(result, &EncodingMapType);
8166 mresult = (struct encoding_map*)result;
8167 mresult->count2 = count2;
8168 mresult->count3 = count3;
8169 mlevel1 = mresult->level1;
8170 mlevel2 = mresult->level23;
8171 mlevel3 = mresult->level23 + 16*count2;
8172 memcpy(mlevel1, level1, 32);
8173 memset(mlevel2, 0xFF, 16*count2);
8174 memset(mlevel3, 0, 128*count3);
8175 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008176 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008178 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8179 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180 /* unmapped character */
8181 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008182 o1 = ch>>11;
8183 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184 i2 = 16*mlevel1[o1] + o2;
8185 if (mlevel2[i2] == 0xFF)
8186 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008187 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008188 i3 = 128*mlevel2[i2] + o3;
8189 mlevel3[i3] = i;
8190 }
8191 return result;
8192}
8193
8194static int
Victor Stinner22168992011-11-20 17:09:18 +01008195encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196{
8197 struct encoding_map *map = (struct encoding_map*)mapping;
8198 int l1 = c>>11;
8199 int l2 = (c>>7) & 0xF;
8200 int l3 = c & 0x7F;
8201 int i;
8202
Victor Stinner22168992011-11-20 17:09:18 +01008203 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205 if (c == 0)
8206 return 0;
8207 /* level 1*/
8208 i = map->level1[l1];
8209 if (i == 0xFF) {
8210 return -1;
8211 }
8212 /* level 2*/
8213 i = map->level23[16*i+l2];
8214 if (i == 0xFF) {
8215 return -1;
8216 }
8217 /* level 3 */
8218 i = map->level23[16*map->count2 + 128*i + l3];
8219 if (i == 0) {
8220 return -1;
8221 }
8222 return i;
8223}
8224
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225/* Lookup the character ch in the mapping. If the character
8226 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008227 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008228static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008229charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230{
Christian Heimes217cfd12007-12-02 14:31:20 +00008231 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232 PyObject *x;
8233
8234 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 x = PyObject_GetItem(mapping, w);
8237 Py_DECREF(w);
8238 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8240 /* No mapping found means: mapping is undefined. */
8241 PyErr_Clear();
8242 x = Py_None;
8243 Py_INCREF(x);
8244 return x;
8245 } else
8246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008248 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008250 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 long value = PyLong_AS_LONG(x);
8252 if (value < 0 || value > 255) {
8253 PyErr_SetString(PyExc_TypeError,
8254 "character mapping must be in range(256)");
8255 Py_DECREF(x);
8256 return NULL;
8257 }
8258 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008260 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 /* wrong return value */
8264 PyErr_Format(PyExc_TypeError,
8265 "character mapping must return integer, bytes or None, not %.400s",
8266 x->ob_type->tp_name);
8267 Py_DECREF(x);
8268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
8270}
8271
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008273charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8276 /* exponentially overallocate to minimize reallocations */
8277 if (requiredsize < 2*outsize)
8278 requiredsize = 2*outsize;
8279 if (_PyBytes_Resize(outobj, requiredsize))
8280 return -1;
8281 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008282}
8283
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008288 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 space is available. Return a new reference to the object that
8290 was put in the output buffer, or Py_None, if the mapping was undefined
8291 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008292 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008293static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008294charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008295 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297 PyObject *rep;
8298 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008299 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300
Christian Heimes90aa7642007-12-19 02:45:37 +00008301 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304 if (res == -1)
8305 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 if (outsize<requiredsize)
8307 if (charmapencode_resize(outobj, outpos, requiredsize))
8308 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008309 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 outstart[(*outpos)++] = (char)res;
8311 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 }
8313
8314 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 Py_DECREF(rep);
8319 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 if (PyLong_Check(rep)) {
8322 Py_ssize_t requiredsize = *outpos+1;
8323 if (outsize<requiredsize)
8324 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8325 Py_DECREF(rep);
8326 return enc_EXCEPTION;
8327 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008328 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 else {
8332 const char *repchars = PyBytes_AS_STRING(rep);
8333 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8334 Py_ssize_t requiredsize = *outpos+repsize;
8335 if (outsize<requiredsize)
8336 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8337 Py_DECREF(rep);
8338 return enc_EXCEPTION;
8339 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008340 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 memcpy(outstart + *outpos, repchars, repsize);
8342 *outpos += repsize;
8343 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 Py_DECREF(rep);
8346 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347}
8348
8349/* handle an error in PyUnicode_EncodeCharmap
8350 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008351static int
8352charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008353 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008355 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008356 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357{
8358 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008359 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008360 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008361 enum PyUnicode_Kind kind;
8362 void *data;
8363 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008365 Py_ssize_t collstartpos = *inpos;
8366 Py_ssize_t collendpos = *inpos+1;
8367 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 char *encoding = "charmap";
8369 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008371 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008372 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373
Benjamin Petersonbac79492012-01-14 13:34:47 -05008374 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008375 return -1;
8376 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 /* find all unencodable characters */
8378 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008379 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008380 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008381 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008382 val = encoding_map_lookup(ch, mapping);
8383 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 break;
8385 ++collendpos;
8386 continue;
8387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008388
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8390 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 if (rep==NULL)
8392 return -1;
8393 else if (rep!=Py_None) {
8394 Py_DECREF(rep);
8395 break;
8396 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008397 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 }
8400 /* cache callback name lookup
8401 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008402 if (*error_handler == _Py_ERROR_UNKNOWN)
8403 *error_handler = get_error_handler(errors);
8404
8405 switch (*error_handler) {
8406 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008407 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008409
8410 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 x = charmapencode_output('?', mapping, res, respos);
8413 if (x==enc_EXCEPTION) {
8414 return -1;
8415 }
8416 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008417 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return -1;
8419 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008420 }
8421 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008422 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423 *inpos = collendpos;
8424 break;
Victor Stinner50149202015-09-22 00:26:54 +02008425
8426 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 /* generate replacement (temporarily (mis)uses p) */
8428 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 char buffer[2+29+1+1];
8430 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 for (cp = buffer; *cp; ++cp) {
8433 x = charmapencode_output(*cp, mapping, res, respos);
8434 if (x==enc_EXCEPTION)
8435 return -1;
8436 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008437 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return -1;
8439 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 }
8441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 *inpos = collendpos;
8443 break;
Victor Stinner50149202015-09-22 00:26:54 +02008444
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 default:
Victor Stinner50149202015-09-22 00:26:54 +02008446 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008447 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008449 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008451 if (PyBytes_Check(repunicode)) {
8452 /* Directly copy bytes result to output. */
8453 Py_ssize_t outsize = PyBytes_Size(*res);
8454 Py_ssize_t requiredsize;
8455 repsize = PyBytes_Size(repunicode);
8456 requiredsize = *respos + repsize;
8457 if (requiredsize > outsize)
8458 /* Make room for all additional bytes. */
8459 if (charmapencode_resize(res, respos, requiredsize)) {
8460 Py_DECREF(repunicode);
8461 return -1;
8462 }
8463 memcpy(PyBytes_AsString(*res) + *respos,
8464 PyBytes_AsString(repunicode), repsize);
8465 *respos += repsize;
8466 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008467 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008468 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008471 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008472 Py_DECREF(repunicode);
8473 return -1;
8474 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008475 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008476 data = PyUnicode_DATA(repunicode);
8477 kind = PyUnicode_KIND(repunicode);
8478 for (index = 0; index < repsize; index++) {
8479 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8480 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008482 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 return -1;
8484 }
8485 else if (x==enc_FAILED) {
8486 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008487 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return -1;
8489 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008490 }
8491 *inpos = newpos;
8492 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 }
8494 return 0;
8495}
8496
Alexander Belopolsky40018472011-02-26 01:02:56 +00008497PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008498_PyUnicode_EncodeCharmap(PyObject *unicode,
8499 PyObject *mapping,
8500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 /* output object */
8503 PyObject *res = NULL;
8504 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008505 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008506 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008508 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008509 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008511 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008512 void *data;
8513 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514
Benjamin Petersonbac79492012-01-14 13:34:47 -05008515 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516 return NULL;
8517 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008518 data = PyUnicode_DATA(unicode);
8519 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008520
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 /* Default to Latin-1 */
8522 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008523 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 /* allocate enough for a simple encoding without
8526 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008527 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 if (res == NULL)
8529 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008530 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008534 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 if (x==enc_EXCEPTION) /* error */
8538 goto onError;
8539 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008542 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 &res, &respos)) {
8544 goto onError;
8545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 else
8548 /* done with this character => adjust input position */
8549 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008553 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008554 if (_PyBytes_Resize(&res, respos) < 0)
8555 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008558 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 return res;
8560
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 Py_XDECREF(res);
8563 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008564 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 return NULL;
8566}
8567
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008568/* Deprecated */
8569PyObject *
8570PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8571 Py_ssize_t size,
8572 PyObject *mapping,
8573 const char *errors)
8574{
8575 PyObject *result;
8576 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8577 if (unicode == NULL)
8578 return NULL;
8579 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8580 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008581 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008582}
8583
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584PyObject *
8585PyUnicode_AsCharmapString(PyObject *unicode,
8586 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587{
8588 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 PyErr_BadArgument();
8590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008592 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593}
8594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008596static void
8597make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008599 Py_ssize_t startpos, Py_ssize_t endpos,
8600 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 *exceptionObject = _PyUnicodeTranslateError_Create(
8604 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 }
8606 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8608 goto onError;
8609 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8610 goto onError;
8611 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8612 goto onError;
8613 return;
8614 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008615 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 }
8617}
8618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619/* error handling callback helper:
8620 build arguments, call the callback and check the arguments,
8621 put the result into newpos and return the replacement string, which
8622 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008623static PyObject *
8624unicode_translate_call_errorhandler(const char *errors,
8625 PyObject **errorHandler,
8626 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008628 Py_ssize_t startpos, Py_ssize_t endpos,
8629 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008631 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008633 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 PyObject *restuple;
8635 PyObject *resunicode;
8636
8637 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 }
8642
8643 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647
8648 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008653 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 Py_DECREF(restuple);
8655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 }
8657 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 &resunicode, &i_newpos)) {
8659 Py_DECREF(restuple);
8660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008662 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008664 else
8665 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008667 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 Py_DECREF(restuple);
8669 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008670 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 Py_INCREF(resunicode);
8672 Py_DECREF(restuple);
8673 return resunicode;
8674}
8675
8676/* Lookup the character ch in the mapping and put the result in result,
8677 which must be decrefed by the caller.
8678 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008679static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681{
Christian Heimes217cfd12007-12-02 14:31:20 +00008682 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 PyObject *x;
8684
8685 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 x = PyObject_GetItem(mapping, w);
8688 Py_DECREF(w);
8689 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8691 /* No mapping found means: use 1:1 mapping. */
8692 PyErr_Clear();
8693 *result = NULL;
8694 return 0;
8695 } else
8696 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 }
8698 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 *result = x;
8700 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008702 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008704 if (value < 0 || value > MAX_UNICODE) {
8705 PyErr_Format(PyExc_ValueError,
8706 "character mapping must be in range(0x%x)",
8707 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 Py_DECREF(x);
8709 return -1;
8710 }
8711 *result = x;
8712 return 0;
8713 }
8714 else if (PyUnicode_Check(x)) {
8715 *result = x;
8716 return 0;
8717 }
8718 else {
8719 /* wrong return value */
8720 PyErr_SetString(PyExc_TypeError,
8721 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008722 Py_DECREF(x);
8723 return -1;
8724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725}
Victor Stinner1194ea02014-04-04 19:37:40 +02008726
8727/* lookup the character, write the result into the writer.
8728 Return 1 if the result was written into the writer, return 0 if the mapping
8729 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008730static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008731charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8732 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733{
Victor Stinner1194ea02014-04-04 19:37:40 +02008734 PyObject *item;
8735
8736 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008738
8739 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008741 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008744 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008746
8747 if (item == Py_None) {
8748 Py_DECREF(item);
8749 return 0;
8750 }
8751
8752 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008753 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8754 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8755 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008756 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8757 Py_DECREF(item);
8758 return -1;
8759 }
8760 Py_DECREF(item);
8761 return 1;
8762 }
8763
8764 if (!PyUnicode_Check(item)) {
8765 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008767 }
8768
8769 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8770 Py_DECREF(item);
8771 return -1;
8772 }
8773
8774 Py_DECREF(item);
8775 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776}
8777
Victor Stinner89a76ab2014-04-05 11:44:04 +02008778static int
8779unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8780 Py_UCS1 *translate)
8781{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008782 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008783 int ret = 0;
8784
Victor Stinner89a76ab2014-04-05 11:44:04 +02008785 if (charmaptranslate_lookup(ch, mapping, &item)) {
8786 return -1;
8787 }
8788
8789 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008790 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008791 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008792 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008793 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008794 /* not found => default to 1:1 mapping */
8795 translate[ch] = ch;
8796 return 1;
8797 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008798 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008799 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008800 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8801 used it */
8802 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008803 /* invalid character or character outside ASCII:
8804 skip the fast translate */
8805 goto exit;
8806 }
8807 translate[ch] = (Py_UCS1)replace;
8808 }
8809 else if (PyUnicode_Check(item)) {
8810 Py_UCS4 replace;
8811
8812 if (PyUnicode_READY(item) == -1) {
8813 Py_DECREF(item);
8814 return -1;
8815 }
8816 if (PyUnicode_GET_LENGTH(item) != 1)
8817 goto exit;
8818
8819 replace = PyUnicode_READ_CHAR(item, 0);
8820 if (replace > 127)
8821 goto exit;
8822 translate[ch] = (Py_UCS1)replace;
8823 }
8824 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008826 goto exit;
8827 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008828 ret = 1;
8829
Benjamin Peterson1365de72014-04-07 20:15:41 -04008830 exit:
8831 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008832 return ret;
8833}
8834
8835/* Fast path for ascii => ascii translation. Return 1 if the whole string
8836 was translated into writer, return 0 if the input string was partially
8837 translated into writer, raise an exception and return -1 on error. */
8838static int
8839unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008840 _PyUnicodeWriter *writer, int ignore,
8841 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008842{
Victor Stinner872b2912014-04-05 14:27:07 +02008843 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844 Py_ssize_t len;
8845 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008846 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008847
Victor Stinner89a76ab2014-04-05 11:44:04 +02008848 len = PyUnicode_GET_LENGTH(input);
8849
Victor Stinner872b2912014-04-05 14:27:07 +02008850 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008851
8852 in = PyUnicode_1BYTE_DATA(input);
8853 end = in + len;
8854
8855 assert(PyUnicode_IS_ASCII(writer->buffer));
8856 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8857 out = PyUnicode_1BYTE_DATA(writer->buffer);
8858
Victor Stinner872b2912014-04-05 14:27:07 +02008859 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008861 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008863 int translate = unicode_fast_translate_lookup(mapping, ch,
8864 ascii_table);
8865 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008867 if (translate == 0)
8868 goto exit;
8869 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870 }
Victor Stinner872b2912014-04-05 14:27:07 +02008871 if (ch2 == 0xfe) {
8872 if (ignore)
8873 continue;
8874 goto exit;
8875 }
8876 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008878 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 }
Victor Stinner872b2912014-04-05 14:27:07 +02008880 res = 1;
8881
8882exit:
8883 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008884 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008885 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886}
8887
Victor Stinner3222da22015-10-01 22:07:32 +02008888static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889_PyUnicode_TranslateCharmap(PyObject *input,
8890 PyObject *mapping,
8891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008894 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 Py_ssize_t size, i;
8896 int kind;
8897 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008898 _PyUnicodeWriter writer;
8899 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008900 char *reason = "character maps to <undefined>";
8901 PyObject *errorHandler = NULL;
8902 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008903 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 PyErr_BadArgument();
8908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 if (PyUnicode_READY(input) == -1)
8912 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008913 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 kind = PyUnicode_KIND(input);
8915 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008917 if (size == 0)
8918 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008920 /* allocate enough for a simple 1:1 translation without
8921 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008922 _PyUnicodeWriter_Init(&writer);
8923 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925
Victor Stinner872b2912014-04-05 14:27:07 +02008926 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8927
Victor Stinner33798672016-03-01 21:59:58 +01008928 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008930 if (PyUnicode_IS_ASCII(input)) {
8931 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8932 if (res < 0) {
8933 _PyUnicodeWriter_Dealloc(&writer);
8934 return NULL;
8935 }
8936 if (res == 1)
8937 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008938 }
Victor Stinner33798672016-03-01 21:59:58 +01008939 else {
8940 i = 0;
8941 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008945 int translate;
8946 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8947 Py_ssize_t newpos;
8948 /* startpos for collecting untranslatable chars */
8949 Py_ssize_t collstart;
8950 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008951 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 ch = PyUnicode_READ(kind, data, i);
8954 translate = charmaptranslate_output(ch, mapping, &writer);
8955 if (translate < 0)
8956 goto onError;
8957
8958 if (translate != 0) {
8959 /* it worked => adjust input pointer */
8960 ++i;
8961 continue;
8962 }
8963
8964 /* untranslatable character */
8965 collstart = i;
8966 collend = i+1;
8967
8968 /* find all untranslatable characters */
8969 while (collend < size) {
8970 PyObject *x;
8971 ch = PyUnicode_READ(kind, data, collend);
8972 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008973 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 Py_XDECREF(x);
8975 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008977 ++collend;
8978 }
8979
8980 if (ignore) {
8981 i = collend;
8982 }
8983 else {
8984 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8985 reason, input, &exc,
8986 collstart, collend, &newpos);
8987 if (repunicode == NULL)
8988 goto onError;
8989 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008992 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008993 Py_DECREF(repunicode);
8994 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008995 }
8996 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008997 Py_XDECREF(exc);
8998 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008999 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009002 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009003 Py_XDECREF(exc);
9004 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 return NULL;
9006}
9007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008/* Deprecated. Use PyUnicode_Translate instead. */
9009PyObject *
9010PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9011 Py_ssize_t size,
9012 PyObject *mapping,
9013 const char *errors)
9014{
Christian Heimes5f520f42012-09-11 14:03:25 +02009015 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9017 if (!unicode)
9018 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009019 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9020 Py_DECREF(unicode);
9021 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022}
9023
Alexander Belopolsky40018472011-02-26 01:02:56 +00009024PyObject *
9025PyUnicode_Translate(PyObject *str,
9026 PyObject *mapping,
9027 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009029 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009030 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009031 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032}
Tim Petersced69f82003-09-16 20:30:58 +00009033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009035fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036{
9037 /* No need to call PyUnicode_READY(self) because this function is only
9038 called as a callback from fixup() which does it already. */
9039 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9040 const int kind = PyUnicode_KIND(self);
9041 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009042 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009043 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 Py_ssize_t i;
9045
9046 for (i = 0; i < len; ++i) {
9047 ch = PyUnicode_READ(kind, data, i);
9048 fixed = 0;
9049 if (ch > 127) {
9050 if (Py_UNICODE_ISSPACE(ch))
9051 fixed = ' ';
9052 else {
9053 const int decimal = Py_UNICODE_TODECIMAL(ch);
9054 if (decimal >= 0)
9055 fixed = '0' + decimal;
9056 }
9057 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009058 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009059 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 PyUnicode_WRITE(kind, data, i, fixed);
9061 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009062 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009063 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065 }
9066
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009067 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068}
9069
9070PyObject *
9071_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9072{
9073 if (!PyUnicode_Check(unicode)) {
9074 PyErr_BadInternalCall();
9075 return NULL;
9076 }
9077 if (PyUnicode_READY(unicode) == -1)
9078 return NULL;
9079 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9080 /* If the string is already ASCII, just return the same string */
9081 Py_INCREF(unicode);
9082 return unicode;
9083 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009084 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085}
9086
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009087PyObject *
9088PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9089 Py_ssize_t length)
9090{
Victor Stinnerf0124502011-11-21 23:12:56 +01009091 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009092 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009093 Py_UCS4 maxchar;
9094 enum PyUnicode_Kind kind;
9095 void *data;
9096
Victor Stinner99d7ad02012-02-22 13:37:39 +01009097 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009098 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009099 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009100 if (ch > 127) {
9101 int decimal = Py_UNICODE_TODECIMAL(ch);
9102 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009103 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009104 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009105 }
9106 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009107
9108 /* Copy to a new string */
9109 decimal = PyUnicode_New(length, maxchar);
9110 if (decimal == NULL)
9111 return decimal;
9112 kind = PyUnicode_KIND(decimal);
9113 data = PyUnicode_DATA(decimal);
9114 /* Iterate over code points */
9115 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009116 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009117 if (ch > 127) {
9118 int decimal = Py_UNICODE_TODECIMAL(ch);
9119 if (decimal >= 0)
9120 ch = '0' + decimal;
9121 }
9122 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009124 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009125}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009126/* --- Decimal Encoder ---------------------------------------------------- */
9127
Alexander Belopolsky40018472011-02-26 01:02:56 +00009128int
9129PyUnicode_EncodeDecimal(Py_UNICODE *s,
9130 Py_ssize_t length,
9131 char *output,
9132 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009133{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009134 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009135 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009136 enum PyUnicode_Kind kind;
9137 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009138
9139 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 PyErr_BadArgument();
9141 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009142 }
9143
Victor Stinner42bf7752011-11-21 22:52:58 +01009144 unicode = PyUnicode_FromUnicode(s, length);
9145 if (unicode == NULL)
9146 return -1;
9147
Benjamin Petersonbac79492012-01-14 13:34:47 -05009148 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009149 Py_DECREF(unicode);
9150 return -1;
9151 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009152 kind = PyUnicode_KIND(unicode);
9153 data = PyUnicode_DATA(unicode);
9154
Victor Stinnerb84d7232011-11-22 01:50:07 +01009155 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009156 PyObject *exc;
9157 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009159 Py_ssize_t startpos;
9160
9161 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009162
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009165 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 decimal = Py_UNICODE_TODECIMAL(ch);
9169 if (decimal >= 0) {
9170 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009171 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 continue;
9173 }
9174 if (0 < ch && ch < 256) {
9175 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009176 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 continue;
9178 }
Victor Stinner6345be92011-11-25 20:09:01 +01009179
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009181 exc = NULL;
9182 raise_encode_exception(&exc, "decimal", unicode,
9183 startpos, startpos+1,
9184 "invalid decimal Unicode string");
9185 Py_XDECREF(exc);
9186 Py_DECREF(unicode);
9187 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009188 }
9189 /* 0-terminate the output string */
9190 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009191 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009192 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009193}
9194
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195/* --- Helpers ------------------------------------------------------------ */
9196
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to at most `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` is not clamped from
   above.  `start` and `end` must be modifiable lvalues; every argument may
   be evaluated more than once, so none of them may have side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. under an unbraced if/else), and the arguments are
   parenthesized so that expression arguments expand safely. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009213any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009215 Py_ssize_t end,
9216 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009218 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 void *buf1, *buf2;
9220 Py_ssize_t len1, len2, result;
9221
9222 kind1 = PyUnicode_KIND(s1);
9223 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009224 if (kind1 < kind2)
9225 return -1;
9226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 len1 = PyUnicode_GET_LENGTH(s1);
9228 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009229 ADJUST_INDICES(start, end, len1);
9230 if (end - start < len2)
9231 return -1;
9232
9233 buf1 = PyUnicode_DATA(s1);
9234 buf2 = PyUnicode_DATA(s2);
9235 if (len2 == 1) {
9236 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9237 result = findchar((const char *)buf1 + kind1*start,
9238 kind1, end - start, ch, direction);
9239 if (result == -1)
9240 return -1;
9241 else
9242 return start + result;
9243 }
9244
9245 if (kind2 != kind1) {
9246 buf2 = _PyUnicode_AsKind(s2, kind1);
9247 if (!buf2)
9248 return -2;
9249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250
Victor Stinner794d5672011-10-10 03:21:36 +02009251 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009252 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009253 case PyUnicode_1BYTE_KIND:
9254 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9255 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9256 else
9257 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9258 break;
9259 case PyUnicode_2BYTE_KIND:
9260 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9261 break;
9262 case PyUnicode_4BYTE_KIND:
9263 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9264 break;
9265 default:
9266 assert(0); result = -2;
9267 }
9268 }
9269 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009270 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009271 case PyUnicode_1BYTE_KIND:
9272 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9273 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9274 else
9275 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9276 break;
9277 case PyUnicode_2BYTE_KIND:
9278 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9279 break;
9280 case PyUnicode_4BYTE_KIND:
9281 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9282 break;
9283 default:
9284 assert(0); result = -2;
9285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 }
9287
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009288 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 PyMem_Free(buf2);
9290
9291 return result;
9292}
9293
9294Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009295_PyUnicode_InsertThousandsGrouping(
9296 PyObject *unicode, Py_ssize_t index,
9297 Py_ssize_t n_buffer,
9298 void *digits, Py_ssize_t n_digits,
9299 Py_ssize_t min_width,
9300 const char *grouping, PyObject *thousands_sep,
9301 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302{
Victor Stinner41a863c2012-02-24 00:37:51 +01009303 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009304 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009305 Py_ssize_t thousands_sep_len;
9306 Py_ssize_t len;
9307
9308 if (unicode != NULL) {
9309 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009310 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009311 }
9312 else {
9313 kind = PyUnicode_1BYTE_KIND;
9314 data = NULL;
9315 }
9316 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9317 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9318 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9319 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009320 if (thousands_sep_kind < kind) {
9321 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9322 if (!thousands_sep_data)
9323 return -1;
9324 }
9325 else {
9326 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9327 if (!data)
9328 return -1;
9329 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 }
9331
Benjamin Petersonead6b532011-12-20 17:23:42 -06009332 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009334 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009336 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009337 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009338 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009339 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009340 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009341 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009342 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009343 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009344 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009346 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009347 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009348 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009349 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009350 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009352 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009353 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009354 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009355 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009356 break;
9357 default:
9358 assert(0);
9359 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009361 if (unicode != NULL && thousands_sep_kind != kind) {
9362 if (thousands_sep_kind < kind)
9363 PyMem_Free(thousands_sep_data);
9364 else
9365 PyMem_Free(data);
9366 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 if (unicode == NULL) {
9368 *maxchar = 127;
9369 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009370 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009371 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 }
9373 }
9374 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375}
9376
9377
Alexander Belopolsky40018472011-02-26 01:02:56 +00009378Py_ssize_t
9379PyUnicode_Count(PyObject *str,
9380 PyObject *substr,
9381 Py_ssize_t start,
9382 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009384 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009385 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 void *buf1 = NULL, *buf2 = NULL;
9387 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009388
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009389 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009391
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009392 kind1 = PyUnicode_KIND(str);
9393 kind2 = PyUnicode_KIND(substr);
9394 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009395 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009396
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009397 len1 = PyUnicode_GET_LENGTH(str);
9398 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009400 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009401 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009402
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009403 buf1 = PyUnicode_DATA(str);
9404 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009406 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009407 if (!buf2)
9408 goto onError;
9409 }
9410
9411 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009413 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009414 result = asciilib_count(
9415 ((Py_UCS1*)buf1) + start, end - start,
9416 buf2, len2, PY_SSIZE_T_MAX
9417 );
9418 else
9419 result = ucs1lib_count(
9420 ((Py_UCS1*)buf1) + start, end - start,
9421 buf2, len2, PY_SSIZE_T_MAX
9422 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 break;
9424 case PyUnicode_2BYTE_KIND:
9425 result = ucs2lib_count(
9426 ((Py_UCS2*)buf1) + start, end - start,
9427 buf2, len2, PY_SSIZE_T_MAX
9428 );
9429 break;
9430 case PyUnicode_4BYTE_KIND:
9431 result = ucs4lib_count(
9432 ((Py_UCS4*)buf1) + start, end - start,
9433 buf2, len2, PY_SSIZE_T_MAX
9434 );
9435 break;
9436 default:
9437 assert(0); result = 0;
9438 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009440 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 PyMem_Free(buf2);
9442
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009445 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 PyMem_Free(buf2);
9447 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448}
9449
Alexander Belopolsky40018472011-02-26 01:02:56 +00009450Py_ssize_t
9451PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009452 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009453 Py_ssize_t start,
9454 Py_ssize_t end,
9455 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009457 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009459
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009460 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461}
9462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463Py_ssize_t
9464PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9465 Py_ssize_t start, Py_ssize_t end,
9466 int direction)
9467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009469 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 if (PyUnicode_READY(str) == -1)
9471 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009472 if (start < 0 || end < 0) {
9473 PyErr_SetString(PyExc_IndexError, "string index out of range");
9474 return -2;
9475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 if (end > PyUnicode_GET_LENGTH(str))
9477 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009478 if (start >= end)
9479 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009481 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9482 kind, end-start, ch, direction);
9483 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009485 else
9486 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487}
9488
Alexander Belopolsky40018472011-02-26 01:02:56 +00009489static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009490tailmatch(PyObject *self,
9491 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492 Py_ssize_t start,
9493 Py_ssize_t end,
9494 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 int kind_self;
9497 int kind_sub;
9498 void *data_self;
9499 void *data_sub;
9500 Py_ssize_t offset;
9501 Py_ssize_t i;
9502 Py_ssize_t end_sub;
9503
9504 if (PyUnicode_READY(self) == -1 ||
9505 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9509 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009513 if (PyUnicode_GET_LENGTH(substring) == 0)
9514 return 1;
9515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 kind_self = PyUnicode_KIND(self);
9517 data_self = PyUnicode_DATA(self);
9518 kind_sub = PyUnicode_KIND(substring);
9519 data_sub = PyUnicode_DATA(substring);
9520 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9521
9522 if (direction > 0)
9523 offset = end;
9524 else
9525 offset = start;
9526
9527 if (PyUnicode_READ(kind_self, data_self, offset) ==
9528 PyUnicode_READ(kind_sub, data_sub, 0) &&
9529 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9530 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9531 /* If both are of the same kind, memcmp is sufficient */
9532 if (kind_self == kind_sub) {
9533 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009534 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 data_sub,
9536 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009537 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009539 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 else {
9541 /* We do not need to compare 0 and len(substring)-1 because
9542 the if statement above ensured already that they are equal
9543 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 for (i = 1; i < end_sub; ++i) {
9545 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9546 PyUnicode_READ(kind_sub, data_sub, i))
9547 return 0;
9548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551 }
9552
9553 return 0;
9554}
9555
Alexander Belopolsky40018472011-02-26 01:02:56 +00009556Py_ssize_t
9557PyUnicode_Tailmatch(PyObject *str,
9558 PyObject *substr,
9559 Py_ssize_t start,
9560 Py_ssize_t end,
9561 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009563 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009565
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009566 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567}
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569/* Apply fixfct filter to the Unicode object self and return a
9570 reference to the modified object */
9571
Alexander Belopolsky40018472011-02-26 01:02:56 +00009572static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009573fixup(PyObject *self,
9574 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 PyObject *u;
9577 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009578 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009580 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009583 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 /* fix functions return the new maximum character in a string,
9586 if the kind of the resulting unicode object does not change,
9587 everything is fine. Otherwise we need to change the string kind
9588 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009589 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009590
9591 if (maxchar_new == 0) {
9592 /* no changes */;
9593 if (PyUnicode_CheckExact(self)) {
9594 Py_DECREF(u);
9595 Py_INCREF(self);
9596 return self;
9597 }
9598 else
9599 return u;
9600 }
9601
Victor Stinnere6abb482012-05-02 01:15:40 +02009602 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603
Victor Stinnereaab6042011-12-11 22:22:39 +01009604 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009606
9607 /* In case the maximum character changed, we need to
9608 convert the string to the new category. */
9609 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9610 if (v == NULL) {
9611 Py_DECREF(u);
9612 return NULL;
9613 }
9614 if (maxchar_new > maxchar_old) {
9615 /* If the maxchar increased so that the kind changed, not all
9616 characters are representable anymore and we need to fix the
9617 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009618 _PyUnicode_FastCopyCharacters(v, 0,
9619 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009620 maxchar_old = fixfct(v);
9621 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 }
9623 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009624 _PyUnicode_FastCopyCharacters(v, 0,
9625 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009627 Py_DECREF(u);
9628 assert(_PyUnicode_CheckConsistency(v, 1));
9629 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630}
9631
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632static PyObject *
9633ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9636 char *resdata, *data = PyUnicode_DATA(self);
9637 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009638
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639 res = PyUnicode_New(len, 127);
9640 if (res == NULL)
9641 return NULL;
9642 resdata = PyUnicode_DATA(res);
9643 if (lower)
9644 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646 _Py_bytes_upper(resdata, data, len);
9647 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 Py_ssize_t j;
9654 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009655 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009657
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9659
9660 where ! is a negation and \p{xxx} is a character with property xxx.
9661 */
9662 for (j = i - 1; j >= 0; j--) {
9663 c = PyUnicode_READ(kind, data, j);
9664 if (!_PyUnicode_IsCaseIgnorable(c))
9665 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9668 if (final_sigma) {
9669 for (j = i + 1; j < length; j++) {
9670 c = PyUnicode_READ(kind, data, j);
9671 if (!_PyUnicode_IsCaseIgnorable(c))
9672 break;
9673 }
9674 final_sigma = j == length || !_PyUnicode_IsCased(c);
9675 }
9676 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679static int
9680lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9681 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 /* Obscure special case. */
9684 if (c == 0x3A3) {
9685 mapped[0] = handle_capital_sigma(kind, data, length, i);
9686 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689}
9690
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691static Py_ssize_t
9692do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 Py_ssize_t i, k = 0;
9695 int n_res, j;
9696 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009697
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 c = PyUnicode_READ(kind, data, 0);
9699 n_res = _PyUnicode_ToUpperFull(c, mapped);
9700 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009701 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 for (i = 1; i < length; i++) {
9705 c = PyUnicode_READ(kind, data, i);
9706 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9707 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009708 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009710 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009711 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713}
9714
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715static Py_ssize_t
9716do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9717 Py_ssize_t i, k = 0;
9718
9719 for (i = 0; i < length; i++) {
9720 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9721 int n_res, j;
9722 if (Py_UNICODE_ISUPPER(c)) {
9723 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9724 }
9725 else if (Py_UNICODE_ISLOWER(c)) {
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 }
9728 else {
9729 n_res = 1;
9730 mapped[0] = c;
9731 }
9732 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009733 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 res[k++] = mapped[j];
9735 }
9736 }
9737 return k;
9738}
9739
9740static Py_ssize_t
9741do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9742 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (lower)
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 else
9752 n_res = _PyUnicode_ToUpperFull(c, mapped);
9753 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009754 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 res[k++] = mapped[j];
9756 }
9757 }
9758 return k;
9759}
9760
9761static Py_ssize_t
9762do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9763{
9764 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9765}
9766
9767static Py_ssize_t
9768do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9769{
9770 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9771}
9772
Benjamin Petersone51757f2012-01-12 21:10:29 -05009773static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009774do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9775{
9776 Py_ssize_t i, k = 0;
9777
9778 for (i = 0; i < length; i++) {
9779 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9780 Py_UCS4 mapped[3];
9781 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9782 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009783 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009784 res[k++] = mapped[j];
9785 }
9786 }
9787 return k;
9788}
9789
9790static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009791do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 Py_ssize_t i, k = 0;
9794 int previous_is_cased;
9795
9796 previous_is_cased = 0;
9797 for (i = 0; i < length; i++) {
9798 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9799 Py_UCS4 mapped[3];
9800 int n_res, j;
9801
9802 if (previous_is_cased)
9803 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9804 else
9805 n_res = _PyUnicode_ToTitleFull(c, mapped);
9806
9807 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009808 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009809 res[k++] = mapped[j];
9810 }
9811
9812 previous_is_cased = _PyUnicode_IsCased(c);
9813 }
9814 return k;
9815}
9816
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009817static PyObject *
9818case_operation(PyObject *self,
9819 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9820{
9821 PyObject *res = NULL;
9822 Py_ssize_t length, newlength = 0;
9823 int kind, outkind;
9824 void *data, *outdata;
9825 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9826
Benjamin Petersoneea48462012-01-16 14:28:50 -05009827 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828
9829 kind = PyUnicode_KIND(self);
9830 data = PyUnicode_DATA(self);
9831 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009832 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009833 PyErr_SetString(PyExc_OverflowError, "string is too long");
9834 return NULL;
9835 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009836 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 if (tmp == NULL)
9838 return PyErr_NoMemory();
9839 newlength = perform(kind, data, length, tmp, &maxchar);
9840 res = PyUnicode_New(newlength, maxchar);
9841 if (res == NULL)
9842 goto leave;
9843 tmpend = tmp + newlength;
9844 outdata = PyUnicode_DATA(res);
9845 outkind = PyUnicode_KIND(res);
9846 switch (outkind) {
9847 case PyUnicode_1BYTE_KIND:
9848 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9849 break;
9850 case PyUnicode_2BYTE_KIND:
9851 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9852 break;
9853 case PyUnicode_4BYTE_KIND:
9854 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9855 break;
9856 default:
9857 assert(0);
9858 break;
9859 }
9860 leave:
9861 PyMem_FREE(tmp);
9862 return res;
9863}
9864
Tim Peters8ce9f162004-08-27 01:49:32 +00009865PyObject *
9866PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009868 PyObject *res;
9869 PyObject *fseq;
9870 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009871 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009873 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009874 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009875 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009876 }
9877
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009878 /* NOTE: the following code can't call back into Python code,
9879 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009880 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009881
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009882 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009883 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009884 res = _PyUnicode_JoinArray(separator, items, seqlen);
9885 Py_DECREF(fseq);
9886 return res;
9887}
9888
9889PyObject *
9890_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9891{
9892 PyObject *res = NULL; /* the result */
9893 PyObject *sep = NULL;
9894 Py_ssize_t seplen;
9895 PyObject *item;
9896 Py_ssize_t sz, i, res_offset;
9897 Py_UCS4 maxchar;
9898 Py_UCS4 item_maxchar;
9899 int use_memcpy;
9900 unsigned char *res_data = NULL, *sep_data = NULL;
9901 PyObject *last_obj;
9902 unsigned int kind = 0;
9903
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 /* If empty sequence, return u"". */
9905 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009906 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009908
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009910 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009911 if (seqlen == 1) {
9912 if (PyUnicode_CheckExact(items[0])) {
9913 res = items[0];
9914 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009915 return res;
9916 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009917 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009918 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009919 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009920 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009921 /* Set up sep and seplen */
9922 if (separator == NULL) {
9923 /* fall back to a blank space separator */
9924 sep = PyUnicode_FromOrdinal(' ');
9925 if (!sep)
9926 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009927 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009928 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009929 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009930 else {
9931 if (!PyUnicode_Check(separator)) {
9932 PyErr_Format(PyExc_TypeError,
9933 "separator: expected str instance,"
9934 " %.80s found",
9935 Py_TYPE(separator)->tp_name);
9936 goto onError;
9937 }
9938 if (PyUnicode_READY(separator))
9939 goto onError;
9940 sep = separator;
9941 seplen = PyUnicode_GET_LENGTH(separator);
9942 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9943 /* inc refcount to keep this code path symmetric with the
9944 above case of a blank separator */
9945 Py_INCREF(sep);
9946 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009947 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009948 }
9949
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009950 /* There are at least two things to join, or else we have a subclass
9951 * of str in the sequence.
9952 * Do a pre-pass to figure out the total amount of space we'll
9953 * need (sz), and see whether all argument are strings.
9954 */
9955 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009956#ifdef Py_DEBUG
9957 use_memcpy = 0;
9958#else
9959 use_memcpy = 1;
9960#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009961 for (i = 0; i < seqlen; i++) {
9962 const Py_ssize_t old_sz = sz;
9963 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 if (!PyUnicode_Check(item)) {
9965 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009966 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009967 " %.80s found",
9968 i, Py_TYPE(item)->tp_name);
9969 goto onError;
9970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (PyUnicode_READY(item) == -1)
9972 goto onError;
9973 sz += PyUnicode_GET_LENGTH(item);
9974 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009975 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009976 if (i != 0)
9977 sz += seplen;
9978 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9979 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009980 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009981 goto onError;
9982 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009983 if (use_memcpy && last_obj != NULL) {
9984 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9985 use_memcpy = 0;
9986 }
9987 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 }
Tim Petersced69f82003-09-16 20:30:58 +00009989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009991 if (res == NULL)
9992 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009993
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009995#ifdef Py_DEBUG
9996 use_memcpy = 0;
9997#else
9998 if (use_memcpy) {
9999 res_data = PyUnicode_1BYTE_DATA(res);
10000 kind = PyUnicode_KIND(res);
10001 if (seplen != 0)
10002 sep_data = PyUnicode_1BYTE_DATA(sep);
10003 }
10004#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010005 if (use_memcpy) {
10006 for (i = 0; i < seqlen; ++i) {
10007 Py_ssize_t itemlen;
10008 item = items[i];
10009
10010 /* Copy item, and maybe the separator. */
10011 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010012 Py_MEMCPY(res_data,
10013 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010014 kind * seplen);
10015 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010016 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010017
10018 itemlen = PyUnicode_GET_LENGTH(item);
10019 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010020 Py_MEMCPY(res_data,
10021 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010022 kind * itemlen);
10023 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010024 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010025 }
10026 assert(res_data == PyUnicode_1BYTE_DATA(res)
10027 + kind * PyUnicode_GET_LENGTH(res));
10028 }
10029 else {
10030 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10031 Py_ssize_t itemlen;
10032 item = items[i];
10033
10034 /* Copy item, and maybe the separator. */
10035 if (i && seplen != 0) {
10036 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10037 res_offset += seplen;
10038 }
10039
10040 itemlen = PyUnicode_GET_LENGTH(item);
10041 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010042 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010043 res_offset += itemlen;
10044 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010045 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010047 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010050 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010055 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056 return NULL;
10057}
10058
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width (1, 2 or 4 byte kind)
   and must not be PyUnicode_WCHAR_KIND.

   This is a statement macro: `kind`, `value` and `length` are evaluated
   more than once, so the arguments must be free of side effects.  Every
   argument is parenthesized in the expansion so that compound expressions
   can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10082
Victor Stinnerd3f08822012-05-29 12:57:52 +020010083void
10084_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10085 Py_UCS4 fill_char)
10086{
10087 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10088 const void *data = PyUnicode_DATA(unicode);
10089 assert(PyUnicode_IS_READY(unicode));
10090 assert(unicode_modifiable(unicode));
10091 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10092 assert(start >= 0);
10093 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10094 FILL(kind, data, fill_char, start, length);
10095}
10096
Victor Stinner3fe55312012-01-04 00:33:50 +010010097Py_ssize_t
10098PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10099 Py_UCS4 fill_char)
10100{
10101 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010102
10103 if (!PyUnicode_Check(unicode)) {
10104 PyErr_BadInternalCall();
10105 return -1;
10106 }
10107 if (PyUnicode_READY(unicode) == -1)
10108 return -1;
10109 if (unicode_check_modifiable(unicode))
10110 return -1;
10111
Victor Stinnerd3f08822012-05-29 12:57:52 +020010112 if (start < 0) {
10113 PyErr_SetString(PyExc_IndexError, "string index out of range");
10114 return -1;
10115 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010116 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10117 PyErr_SetString(PyExc_ValueError,
10118 "fill character is bigger than "
10119 "the string maximum character");
10120 return -1;
10121 }
10122
10123 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10124 length = Py_MIN(maxlen, length);
10125 if (length <= 0)
10126 return 0;
10127
Victor Stinnerd3f08822012-05-29 12:57:52 +020010128 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010129 return length;
10130}
10131
Victor Stinner9310abb2011-10-05 00:59:23 +020010132static PyObject *
10133pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010134 Py_ssize_t left,
10135 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 PyObject *u;
10139 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010140 int kind;
10141 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
10143 if (left < 0)
10144 left = 0;
10145 if (right < 0)
10146 right = 0;
10147
Victor Stinnerc4b49542011-12-11 22:44:26 +010010148 if (left == 0 && right == 0)
10149 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10152 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010153 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10154 return NULL;
10155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010157 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010159 if (!u)
10160 return NULL;
10161
10162 kind = PyUnicode_KIND(u);
10163 data = PyUnicode_DATA(u);
10164 if (left)
10165 FILL(kind, data, fill, 0, left);
10166 if (right)
10167 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010168 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010169 assert(_PyUnicode_CheckConsistency(u, 1));
10170 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171}
10172
Alexander Belopolsky40018472011-02-26 01:02:56 +000010173PyObject *
10174PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010178 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Benjamin Petersonead6b532011-12-20 17:23:42 -060010181 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 if (PyUnicode_IS_ASCII(string))
10184 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 PyUnicode_GET_LENGTH(string), keepends);
10187 else
10188 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 break;
10192 case PyUnicode_2BYTE_KIND:
10193 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 PyUnicode_GET_LENGTH(string), keepends);
10196 break;
10197 case PyUnicode_4BYTE_KIND:
10198 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 PyUnicode_GET_LENGTH(string), keepends);
10201 break;
10202 default:
10203 assert(0);
10204 list = 0;
10205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207}
10208
Alexander Belopolsky40018472011-02-26 01:02:56 +000010209static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010210split(PyObject *self,
10211 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010212 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010214 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 void *buf1, *buf2;
10216 Py_ssize_t len1, len2;
10217 PyObject* out;
10218
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010220 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (PyUnicode_READY(self) == -1)
10223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010226 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 if (PyUnicode_IS_ASCII(self))
10229 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 PyUnicode_GET_LENGTH(self), maxcount
10232 );
10233 else
10234 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010235 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 PyUnicode_GET_LENGTH(self), maxcount
10237 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 case PyUnicode_2BYTE_KIND:
10239 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010240 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 PyUnicode_GET_LENGTH(self), maxcount
10242 );
10243 case PyUnicode_4BYTE_KIND:
10244 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyUnicode_GET_LENGTH(self), maxcount
10247 );
10248 default:
10249 assert(0);
10250 return NULL;
10251 }
10252
10253 if (PyUnicode_READY(substring) == -1)
10254 return NULL;
10255
10256 kind1 = PyUnicode_KIND(self);
10257 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 len1 = PyUnicode_GET_LENGTH(self);
10259 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010260 if (kind1 < kind2 || len1 < len2) {
10261 out = PyList_New(1);
10262 if (out == NULL)
10263 return NULL;
10264 Py_INCREF(self);
10265 PyList_SET_ITEM(out, 0, self);
10266 return out;
10267 }
10268 buf1 = PyUnicode_DATA(self);
10269 buf2 = PyUnicode_DATA(substring);
10270 if (kind2 != kind1) {
10271 buf2 = _PyUnicode_AsKind(substring, kind1);
10272 if (!buf2)
10273 return NULL;
10274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010276 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10279 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 else
10282 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 break;
10285 case PyUnicode_2BYTE_KIND:
10286 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 break;
10289 case PyUnicode_4BYTE_KIND:
10290 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 default:
10294 out = NULL;
10295 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010296 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 PyMem_Free(buf2);
10298 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299}
10300
Alexander Belopolsky40018472011-02-26 01:02:56 +000010301static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010302rsplit(PyObject *self,
10303 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010304 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010305{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010306 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 void *buf1, *buf2;
10308 Py_ssize_t len1, len2;
10309 PyObject* out;
10310
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010311 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010312 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (PyUnicode_READY(self) == -1)
10315 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010318 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 if (PyUnicode_IS_ASCII(self))
10321 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010322 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 PyUnicode_GET_LENGTH(self), maxcount
10324 );
10325 else
10326 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 PyUnicode_GET_LENGTH(self), maxcount
10329 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 case PyUnicode_2BYTE_KIND:
10331 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 PyUnicode_GET_LENGTH(self), maxcount
10334 );
10335 case PyUnicode_4BYTE_KIND:
10336 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyUnicode_GET_LENGTH(self), maxcount
10339 );
10340 default:
10341 assert(0);
10342 return NULL;
10343 }
10344
10345 if (PyUnicode_READY(substring) == -1)
10346 return NULL;
10347
10348 kind1 = PyUnicode_KIND(self);
10349 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 len1 = PyUnicode_GET_LENGTH(self);
10351 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010352 if (kind1 < kind2 || len1 < len2) {
10353 out = PyList_New(1);
10354 if (out == NULL)
10355 return NULL;
10356 Py_INCREF(self);
10357 PyList_SET_ITEM(out, 0, self);
10358 return out;
10359 }
10360 buf1 = PyUnicode_DATA(self);
10361 buf2 = PyUnicode_DATA(substring);
10362 if (kind2 != kind1) {
10363 buf2 = _PyUnicode_AsKind(substring, kind1);
10364 if (!buf2)
10365 return NULL;
10366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010368 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10371 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010372 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 else
10374 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 break;
10377 case PyUnicode_2BYTE_KIND:
10378 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 break;
10381 case PyUnicode_4BYTE_KIND:
10382 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 break;
10385 default:
10386 out = NULL;
10387 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010388 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 PyMem_Free(buf2);
10390 return out;
10391}
10392
10393static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10395 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010397 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10400 return asciilib_find(buf1, len1, buf2, len2, offset);
10401 else
10402 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 case PyUnicode_2BYTE_KIND:
10404 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10405 case PyUnicode_4BYTE_KIND:
10406 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10407 }
10408 assert(0);
10409 return -1;
10410}
10411
10412static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010413anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10414 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010416 switch (kind) {
10417 case PyUnicode_1BYTE_KIND:
10418 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10419 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10420 else
10421 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10422 case PyUnicode_2BYTE_KIND:
10423 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10424 case PyUnicode_4BYTE_KIND:
10425 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10426 }
10427 assert(0);
10428 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010429}
10430
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010431static void
10432replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10433 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10434{
10435 int kind = PyUnicode_KIND(u);
10436 void *data = PyUnicode_DATA(u);
10437 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10438 if (kind == PyUnicode_1BYTE_KIND) {
10439 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10440 (Py_UCS1 *)data + len,
10441 u1, u2, maxcount);
10442 }
10443 else if (kind == PyUnicode_2BYTE_KIND) {
10444 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10445 (Py_UCS2 *)data + len,
10446 u1, u2, maxcount);
10447 }
10448 else {
10449 assert(kind == PyUnicode_4BYTE_KIND);
10450 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10451 (Py_UCS4 *)data + len,
10452 u1, u2, maxcount);
10453 }
10454}
10455
Alexander Belopolsky40018472011-02-26 01:02:56 +000010456static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457replace(PyObject *self, PyObject *str1,
10458 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 PyObject *u;
10461 char *sbuf = PyUnicode_DATA(self);
10462 char *buf1 = PyUnicode_DATA(str1);
10463 char *buf2 = PyUnicode_DATA(str2);
10464 int srelease = 0, release1 = 0, release2 = 0;
10465 int skind = PyUnicode_KIND(self);
10466 int kind1 = PyUnicode_KIND(str1);
10467 int kind2 = PyUnicode_KIND(str2);
10468 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10469 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10470 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010471 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010472 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
10474 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010477 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478
Victor Stinner59de0ee2011-10-07 10:01:28 +020010479 if (str1 == str2)
10480 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481
Victor Stinner49a0a212011-10-12 23:46:10 +020010482 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010483 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10484 if (maxchar < maxchar_str1)
10485 /* substring too wide to be present */
10486 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010487 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10488 /* Replacing str1 with str2 may cause a maxchar reduction in the
10489 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010491 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010496 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010499 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010500 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010501
Victor Stinner69ed0f42013-04-09 21:48:24 +020010502 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010503 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010504 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010506 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010510
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010511 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10512 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010513 }
10514 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 int rkind = skind;
10516 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010517 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (kind1 < rkind) {
10520 /* widen substring */
10521 buf1 = _PyUnicode_AsKind(str1, rkind);
10522 if (!buf1) goto error;
10523 release1 = 1;
10524 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010525 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 if (i < 0)
10527 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (rkind > kind2) {
10529 /* widen replacement */
10530 buf2 = _PyUnicode_AsKind(str2, rkind);
10531 if (!buf2) goto error;
10532 release2 = 1;
10533 }
10534 else if (rkind < kind2) {
10535 /* widen self and buf1 */
10536 rkind = kind2;
10537 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010538 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 sbuf = _PyUnicode_AsKind(self, rkind);
10540 if (!sbuf) goto error;
10541 srelease = 1;
10542 buf1 = _PyUnicode_AsKind(str1, rkind);
10543 if (!buf1) goto error;
10544 release1 = 1;
10545 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 u = PyUnicode_New(slen, maxchar);
10547 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 assert(PyUnicode_KIND(u) == rkind);
10550 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010551
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010552 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010553 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010554 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010556 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010558
10559 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010560 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010562 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010563 if (i == -1)
10564 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010565 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010567 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 }
10572 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010574 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 int rkind = skind;
10576 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 buf1 = _PyUnicode_AsKind(str1, rkind);
10581 if (!buf1) goto error;
10582 release1 = 1;
10583 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010584 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010585 if (n == 0)
10586 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 buf2 = _PyUnicode_AsKind(str2, rkind);
10590 if (!buf2) goto error;
10591 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 rkind = kind2;
10596 sbuf = _PyUnicode_AsKind(self, rkind);
10597 if (!sbuf) goto error;
10598 srelease = 1;
10599 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010600 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 buf1 = _PyUnicode_AsKind(str1, rkind);
10602 if (!buf1) goto error;
10603 release1 = 1;
10604 }
10605 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10606 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010607 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 PyErr_SetString(PyExc_OverflowError,
10609 "replace string is too long");
10610 goto error;
10611 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010612 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010614 _Py_INCREF_UNICODE_EMPTY();
10615 if (!unicode_empty)
10616 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 u = unicode_empty;
10618 goto done;
10619 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010620 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 PyErr_SetString(PyExc_OverflowError,
10622 "replace string is too long");
10623 goto error;
10624 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 u = PyUnicode_New(new_size, maxchar);
10626 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010628 assert(PyUnicode_KIND(u) == rkind);
10629 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 ires = i = 0;
10631 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 while (n-- > 0) {
10633 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010634 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010636 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010637 if (j == -1)
10638 break;
10639 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010641 memcpy(res + rkind * ires,
10642 sbuf + rkind * i,
10643 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 }
10646 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010650 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 memcpy(res + rkind * ires,
10658 sbuf + rkind * i,
10659 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010660 }
10661 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 /* interleave */
10663 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 if (--n <= 0)
10669 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010670 memcpy(res + rkind * ires,
10671 sbuf + rkind * i,
10672 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 ires++;
10674 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010676 memcpy(res + rkind * ires,
10677 sbuf + rkind * i,
10678 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010680 }
10681
10682 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010683 unicode_adjust_maxchar(&u);
10684 if (u == NULL)
10685 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010687
10688 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (srelease)
10690 PyMem_FREE(sbuf);
10691 if (release1)
10692 PyMem_FREE(buf1);
10693 if (release2)
10694 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010695 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697
Benjamin Peterson29060642009-01-31 22:14:21 +000010698 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (srelease)
10701 PyMem_FREE(sbuf);
10702 if (release1)
10703 PyMem_FREE(buf1);
10704 if (release2)
10705 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010706 return unicode_result_unchanged(self);
10707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 error:
10709 if (srelease && sbuf)
10710 PyMem_FREE(sbuf);
10711 if (release1 && buf1)
10712 PyMem_FREE(buf1);
10713 if (release2 && buf2)
10714 PyMem_FREE(buf2);
10715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716}
10717
10718/* --- Unicode Object Methods --------------------------------------------- */
10719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010720PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722\n\
10723Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725
10726static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010727unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010729 if (PyUnicode_READY(self) == -1)
10730 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010731 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732}
10733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010734PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010735 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736\n\
10737Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010738have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
10740static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010741unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010743 if (PyUnicode_READY(self) == -1)
10744 return NULL;
10745 if (PyUnicode_GET_LENGTH(self) == 0)
10746 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010747 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748}
10749
Benjamin Petersond5890c82012-01-14 13:23:30 -050010750PyDoc_STRVAR(casefold__doc__,
10751 "S.casefold() -> str\n\
10752\n\
10753Return a version of S suitable for caseless comparisons.");
10754
10755static PyObject *
10756unicode_casefold(PyObject *self)
10757{
10758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
10760 if (PyUnicode_IS_ASCII(self))
10761 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010762 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010763}
10764
10765
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010766/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010767
10768static int
10769convert_uc(PyObject *obj, void *addr)
10770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010772
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010773 if (!PyUnicode_Check(obj)) {
10774 PyErr_Format(PyExc_TypeError,
10775 "The fill character must be a unicode character, "
10776 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010777 return 0;
10778 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010779 if (PyUnicode_READY(obj) < 0)
10780 return 0;
10781 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010782 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010784 return 0;
10785 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010786 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010788}
10789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010790PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010791 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010793Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010794done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795
10796static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010797unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010799 Py_ssize_t marg, left;
10800 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 Py_UCS4 fillchar = ' ';
10802
Victor Stinnere9a29352011-10-01 02:14:59 +020010803 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805
Benjamin Petersonbac79492012-01-14 13:34:47 -050010806 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 return NULL;
10808
Victor Stinnerc4b49542011-12-11 22:44:26 +010010809 if (PyUnicode_GET_LENGTH(self) >= width)
10810 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811
Victor Stinnerc4b49542011-12-11 22:44:26 +010010812 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813 left = marg / 2 + (marg & width & 1);
10814
Victor Stinner9310abb2011-10-05 00:59:23 +020010815 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816}
10817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818/* This function assumes that str1 and str2 are readied by the caller. */
10819
Marc-André Lemburge5034372000-08-08 08:04:29 +000010820static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010821unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010822{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010823#define COMPARE(TYPE1, TYPE2) \
10824 do { \
10825 TYPE1* p1 = (TYPE1 *)data1; \
10826 TYPE2* p2 = (TYPE2 *)data2; \
10827 TYPE1* end = p1 + len; \
10828 Py_UCS4 c1, c2; \
10829 for (; p1 != end; p1++, p2++) { \
10830 c1 = *p1; \
10831 c2 = *p2; \
10832 if (c1 != c2) \
10833 return (c1 < c2) ? -1 : 1; \
10834 } \
10835 } \
10836 while (0)
10837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 int kind1, kind2;
10839 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010840 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 kind1 = PyUnicode_KIND(str1);
10843 kind2 = PyUnicode_KIND(str2);
10844 data1 = PyUnicode_DATA(str1);
10845 data2 = PyUnicode_DATA(str2);
10846 len1 = PyUnicode_GET_LENGTH(str1);
10847 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010848 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010849
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850 switch(kind1) {
10851 case PyUnicode_1BYTE_KIND:
10852 {
10853 switch(kind2) {
10854 case PyUnicode_1BYTE_KIND:
10855 {
10856 int cmp = memcmp(data1, data2, len);
10857 /* normalize result of memcmp() into the range [-1; 1] */
10858 if (cmp < 0)
10859 return -1;
10860 if (cmp > 0)
10861 return 1;
10862 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010863 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010864 case PyUnicode_2BYTE_KIND:
10865 COMPARE(Py_UCS1, Py_UCS2);
10866 break;
10867 case PyUnicode_4BYTE_KIND:
10868 COMPARE(Py_UCS1, Py_UCS4);
10869 break;
10870 default:
10871 assert(0);
10872 }
10873 break;
10874 }
10875 case PyUnicode_2BYTE_KIND:
10876 {
10877 switch(kind2) {
10878 case PyUnicode_1BYTE_KIND:
10879 COMPARE(Py_UCS2, Py_UCS1);
10880 break;
10881 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010882 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 COMPARE(Py_UCS2, Py_UCS2);
10884 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010885 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010886 case PyUnicode_4BYTE_KIND:
10887 COMPARE(Py_UCS2, Py_UCS4);
10888 break;
10889 default:
10890 assert(0);
10891 }
10892 break;
10893 }
10894 case PyUnicode_4BYTE_KIND:
10895 {
10896 switch(kind2) {
10897 case PyUnicode_1BYTE_KIND:
10898 COMPARE(Py_UCS4, Py_UCS1);
10899 break;
10900 case PyUnicode_2BYTE_KIND:
10901 COMPARE(Py_UCS4, Py_UCS2);
10902 break;
10903 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010904 {
10905#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10906 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10907 /* normalize result of wmemcmp() into the range [-1; 1] */
10908 if (cmp < 0)
10909 return -1;
10910 if (cmp > 0)
10911 return 1;
10912#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010914#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010915 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010916 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010917 default:
10918 assert(0);
10919 }
10920 break;
10921 }
10922 default:
10923 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010924 }
10925
Victor Stinner770e19e2012-10-04 22:59:45 +020010926 if (len1 == len2)
10927 return 0;
10928 if (len1 < len2)
10929 return -1;
10930 else
10931 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010932
10933#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010934}
10935
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010936Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010937unicode_compare_eq(PyObject *str1, PyObject *str2)
10938{
10939 int kind;
10940 void *data1, *data2;
10941 Py_ssize_t len;
10942 int cmp;
10943
Victor Stinnere5567ad2012-10-23 02:48:49 +020010944 len = PyUnicode_GET_LENGTH(str1);
10945 if (PyUnicode_GET_LENGTH(str2) != len)
10946 return 0;
10947 kind = PyUnicode_KIND(str1);
10948 if (PyUnicode_KIND(str2) != kind)
10949 return 0;
10950 data1 = PyUnicode_DATA(str1);
10951 data2 = PyUnicode_DATA(str2);
10952
10953 cmp = memcmp(data1, data2, len * kind);
10954 return (cmp == 0);
10955}
10956
10957
Alexander Belopolsky40018472011-02-26 01:02:56 +000010958int
10959PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10962 if (PyUnicode_READY(left) == -1 ||
10963 PyUnicode_READY(right) == -1)
10964 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010965
10966 /* a string is equal to itself */
10967 if (left == right)
10968 return 0;
10969
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010970 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010972 PyErr_Format(PyExc_TypeError,
10973 "Can't compare %.100s and %.100s",
10974 left->ob_type->tp_name,
10975 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976 return -1;
10977}
10978
Martin v. Löwis5b222132007-06-10 09:51:05 +000010979int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010980_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10981{
10982 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10983 if (right_str == NULL)
10984 return -1;
10985 return PyUnicode_Compare(left, right_str);
10986}
10987
10988int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010989PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 Py_ssize_t i;
10992 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 Py_UCS4 chr;
10994
Victor Stinner910337b2011-10-03 03:20:16 +020010995 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (PyUnicode_READY(uni) == -1)
10997 return -1;
10998 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010999 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011000 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011001 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011002 size_t len, len2 = strlen(str);
11003 int cmp;
11004
11005 len = Py_MIN(len1, len2);
11006 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011007 if (cmp != 0) {
11008 if (cmp < 0)
11009 return -1;
11010 else
11011 return 1;
11012 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011013 if (len1 > len2)
11014 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011015 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011016 return -1; /* str is longer */
11017 return 0;
11018 }
11019 else {
11020 void *data = PyUnicode_DATA(uni);
11021 /* Compare Unicode string and source character set string */
11022 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011023 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011024 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11025 /* This check keeps Python strings that end in '\0' from comparing equal
11026 to C strings identical up to that point. */
11027 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11028 return 1; /* uni is longer */
11029 if (str[i])
11030 return -1; /* str is longer */
11031 return 0;
11032 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011033}
11034
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011035
Benjamin Peterson29060642009-01-31 22:14:21 +000011036#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011037 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011038
Alexander Belopolsky40018472011-02-26 01:02:56 +000011039PyObject *
11040PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011041{
11042 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011043 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011044
Victor Stinnere5567ad2012-10-23 02:48:49 +020011045 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11046 Py_RETURN_NOTIMPLEMENTED;
11047
11048 if (PyUnicode_READY(left) == -1 ||
11049 PyUnicode_READY(right) == -1)
11050 return NULL;
11051
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011052 if (left == right) {
11053 switch (op) {
11054 case Py_EQ:
11055 case Py_LE:
11056 case Py_GE:
11057 /* a string is equal to itself */
11058 v = Py_True;
11059 break;
11060 case Py_NE:
11061 case Py_LT:
11062 case Py_GT:
11063 v = Py_False;
11064 break;
11065 default:
11066 PyErr_BadArgument();
11067 return NULL;
11068 }
11069 }
11070 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011071 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011072 result ^= (op == Py_NE);
11073 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011074 }
11075 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011076 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011077
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011078 /* Convert the return value to a Boolean */
11079 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011080 case Py_LE:
11081 v = TEST_COND(result <= 0);
11082 break;
11083 case Py_GE:
11084 v = TEST_COND(result >= 0);
11085 break;
11086 case Py_LT:
11087 v = TEST_COND(result == -1);
11088 break;
11089 case Py_GT:
11090 v = TEST_COND(result == 1);
11091 break;
11092 default:
11093 PyErr_BadArgument();
11094 return NULL;
11095 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011096 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011097 Py_INCREF(v);
11098 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011099}
11100
Alexander Belopolsky40018472011-02-26 01:02:56 +000011101int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011102_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11103{
11104 return unicode_eq(aa, bb);
11105}
11106
11107int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011108PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011109{
Victor Stinner77282cb2013-04-14 19:22:47 +020011110 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 void *buf1, *buf2;
11112 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011113 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011114
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011115 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011117 "'in <string>' requires string as left operand, not %.100s",
11118 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011120 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011121 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011122 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011123 if (ensure_unicode(str) < 0)
11124 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011127 kind2 = PyUnicode_KIND(substr);
11128 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011129 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011131 len2 = PyUnicode_GET_LENGTH(substr);
11132 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011133 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011134 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011135 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011136 if (len2 == 1) {
11137 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11138 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011139 return result;
11140 }
11141 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 buf2 = _PyUnicode_AsKind(substr, kind1);
11143 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011144 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146
Victor Stinner77282cb2013-04-14 19:22:47 +020011147 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 case PyUnicode_1BYTE_KIND:
11149 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11150 break;
11151 case PyUnicode_2BYTE_KIND:
11152 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11153 break;
11154 case PyUnicode_4BYTE_KIND:
11155 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11156 break;
11157 default:
11158 result = -1;
11159 assert(0);
11160 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011161
Victor Stinner77282cb2013-04-14 19:22:47 +020011162 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 PyMem_Free(buf2);
11164
Guido van Rossum403d68b2000-03-13 15:55:09 +000011165 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011166}
11167
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168/* Concat to string or Unicode object giving a new Unicode object. */
11169
Alexander Belopolsky40018472011-02-26 01:02:56 +000011170PyObject *
11171PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011173 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011174 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011175 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011177 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179
11180 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011181 if (left == unicode_empty)
11182 return PyUnicode_FromObject(right);
11183 if (right == unicode_empty)
11184 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011186 left_len = PyUnicode_GET_LENGTH(left);
11187 right_len = PyUnicode_GET_LENGTH(right);
11188 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011189 PyErr_SetString(PyExc_OverflowError,
11190 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011191 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011192 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011193 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011194
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011195 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11196 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011197 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 result = PyUnicode_New(new_len, maxchar);
11201 if (result == NULL)
11202 return NULL;
11203 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11204 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11205 assert(_PyUnicode_CheckConsistency(result, 1));
11206 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207}
11208
Walter Dörwald1ab83302007-05-18 17:15:44 +000011209void
Victor Stinner23e56682011-10-03 03:54:37 +020011210PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011211{
Victor Stinner23e56682011-10-03 03:54:37 +020011212 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011213 Py_UCS4 maxchar, maxchar2;
11214 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011215
11216 if (p_left == NULL) {
11217 if (!PyErr_Occurred())
11218 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011219 return;
11220 }
Victor Stinner23e56682011-10-03 03:54:37 +020011221 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011222 if (right == NULL || left == NULL
11223 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011224 if (!PyErr_Occurred())
11225 PyErr_BadInternalCall();
11226 goto error;
11227 }
11228
Benjamin Petersonbac79492012-01-14 13:34:47 -050011229 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011230 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011231 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011232 goto error;
11233
Victor Stinner488fa492011-12-12 00:01:39 +010011234 /* Shortcuts */
11235 if (left == unicode_empty) {
11236 Py_DECREF(left);
11237 Py_INCREF(right);
11238 *p_left = right;
11239 return;
11240 }
11241 if (right == unicode_empty)
11242 return;
11243
11244 left_len = PyUnicode_GET_LENGTH(left);
11245 right_len = PyUnicode_GET_LENGTH(right);
11246 if (left_len > PY_SSIZE_T_MAX - right_len) {
11247 PyErr_SetString(PyExc_OverflowError,
11248 "strings are too large to concat");
11249 goto error;
11250 }
11251 new_len = left_len + right_len;
11252
11253 if (unicode_modifiable(left)
11254 && PyUnicode_CheckExact(right)
11255 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011256 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11257 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011258 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011259 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011260 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11261 {
11262 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011263 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011264 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011265
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011266 /* copy 'right' into the newly allocated area of 'left' */
11267 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011268 }
Victor Stinner488fa492011-12-12 00:01:39 +010011269 else {
11270 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11271 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011272 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011273
Victor Stinner488fa492011-12-12 00:01:39 +010011274 /* Concat the two Unicode strings */
11275 res = PyUnicode_New(new_len, maxchar);
11276 if (res == NULL)
11277 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011278 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11279 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011280 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011281 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011282 }
11283 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011284 return;
11285
11286error:
Victor Stinner488fa492011-12-12 00:01:39 +010011287 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011288}
11289
11290void
11291PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11292{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011293 PyUnicode_Append(pleft, right);
11294 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011295}
11296
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011297/*
11298Wraps stringlib_parse_args_finds() and additionally ensures that the
11299first argument is a unicode object.
11300*/
11301
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011302static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011303parse_args_finds_unicode(const char * function_name, PyObject *args,
11304 PyObject **substring,
11305 Py_ssize_t *start, Py_ssize_t *end)
11306{
11307 if(stringlib_parse_args_finds(function_name, args, substring,
11308 start, end)) {
11309 if (ensure_unicode(*substring) < 0)
11310 return 0;
11311 return 1;
11312 }
11313 return 0;
11314}
11315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011319Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011320string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011324unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011326 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011327 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011328 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011330 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 void *buf1, *buf2;
11332 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011334 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 kind1 = PyUnicode_KIND(self);
11338 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011339 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011340 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 len1 = PyUnicode_GET_LENGTH(self);
11343 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011345 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011346 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011347
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011348 buf1 = PyUnicode_DATA(self);
11349 buf2 = PyUnicode_DATA(substring);
11350 if (kind2 != kind1) {
11351 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011352 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011353 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011354 }
11355 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 case PyUnicode_1BYTE_KIND:
11357 iresult = ucs1lib_count(
11358 ((Py_UCS1*)buf1) + start, end - start,
11359 buf2, len2, PY_SSIZE_T_MAX
11360 );
11361 break;
11362 case PyUnicode_2BYTE_KIND:
11363 iresult = ucs2lib_count(
11364 ((Py_UCS2*)buf1) + start, end - start,
11365 buf2, len2, PY_SSIZE_T_MAX
11366 );
11367 break;
11368 case PyUnicode_4BYTE_KIND:
11369 iresult = ucs4lib_count(
11370 ((Py_UCS4*)buf1) + start, end - start,
11371 buf2, len2, PY_SSIZE_T_MAX
11372 );
11373 break;
11374 default:
11375 assert(0); iresult = 0;
11376 }
11377
11378 result = PyLong_FromSsize_t(iresult);
11379
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011380 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 return result;
11384}
11385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011386PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011387 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011389Encode S using the codec registered for encoding. Default encoding\n\
11390is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011391handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011392a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11393'xmlcharrefreplace' as well as any other name registered with\n\
11394codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
11396static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011397unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011399 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400 char *encoding = NULL;
11401 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011402
Benjamin Peterson308d6372009-09-18 21:42:35 +000011403 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11404 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011406 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011407}
11408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011409PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011410 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411\n\
11412Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
11415static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011416unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011418 Py_ssize_t i, j, line_pos, src_len, incr;
11419 Py_UCS4 ch;
11420 PyObject *u;
11421 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011422 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011424 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011425 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
Ezio Melotti745d54d2013-11-16 19:10:57 +020011427 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11428 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Antoine Pitrou22425222011-10-04 19:10:51 +020011431 if (PyUnicode_READY(self) == -1)
11432 return NULL;
11433
Thomas Wouters7e474022000-07-16 12:04:32 +000011434 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011435 src_len = PyUnicode_GET_LENGTH(self);
11436 i = j = line_pos = 0;
11437 kind = PyUnicode_KIND(self);
11438 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011439 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011440 for (; i < src_len; i++) {
11441 ch = PyUnicode_READ(kind, src_data, i);
11442 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011443 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011447 goto overflow;
11448 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011450 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011454 goto overflow;
11455 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011457 if (ch == '\n' || ch == '\r')
11458 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011460 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011461 if (!found)
11462 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011463
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 if (!u)
11467 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011468 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Antoine Pitroue71d5742011-10-04 15:55:09 +020011470 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 for (; i < src_len; i++) {
11473 ch = PyUnicode_READ(kind, src_data, i);
11474 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011476 incr = tabsize - (line_pos % tabsize);
11477 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011478 FILL(kind, dest_data, ' ', j, incr);
11479 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011481 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011483 line_pos++;
11484 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011485 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 if (ch == '\n' || ch == '\r')
11487 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011489 }
11490 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011491 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011492
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011494 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496}
11497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500\n\
11501Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011502such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503arguments start and end are interpreted as in slice notation.\n\
11504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011505Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011510 /* initialize variables to prevent gcc warning */
11511 PyObject *substring = NULL;
11512 Py_ssize_t start = 0;
11513 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011514 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011516 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011522 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (result == -2)
11525 return NULL;
11526
Christian Heimes217cfd12007-12-02 14:31:20 +000011527 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528}
11529
11530static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011531unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011533 void *data;
11534 enum PyUnicode_Kind kind;
11535 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011536
11537 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11538 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011540 }
11541 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11542 PyErr_SetString(PyExc_IndexError, "string index out of range");
11543 return NULL;
11544 }
11545 kind = PyUnicode_KIND(self);
11546 data = PyUnicode_DATA(self);
11547 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011548 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549}
11550
Guido van Rossumc2504932007-09-18 19:42:40 +000011551/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011552 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011553static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011554unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Guido van Rossumc2504932007-09-18 19:42:40 +000011556 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011557 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011558
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011559#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011560 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011561#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 if (_PyUnicode_HASH(self) != -1)
11563 return _PyUnicode_HASH(self);
11564 if (PyUnicode_READY(self) == -1)
11565 return -1;
11566 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011567 /*
11568 We make the hash of the empty string be 0, rather than using
11569 (prefix ^ suffix), since this slightly obfuscates the hash secret
11570 */
11571 if (len == 0) {
11572 _PyUnicode_HASH(self) = 0;
11573 return 0;
11574 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011575 x = _Py_HashBytes(PyUnicode_DATA(self),
11576 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011578 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579}
11580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011584Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
11586static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011589 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011590 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011591 PyObject *substring = NULL;
11592 Py_ssize_t start = 0;
11593 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011595 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011601 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 if (result == -2)
11604 return NULL;
11605
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606 if (result < 0) {
11607 PyErr_SetString(PyExc_ValueError, "substring not found");
11608 return NULL;
11609 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011610
Christian Heimes217cfd12007-12-02 14:31:20 +000011611 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612}
11613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011614PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011617Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
11620static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011621unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 Py_ssize_t i, length;
11624 int kind;
11625 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626 int cased;
11627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (PyUnicode_READY(self) == -1)
11629 return NULL;
11630 length = PyUnicode_GET_LENGTH(self);
11631 kind = PyUnicode_KIND(self);
11632 data = PyUnicode_DATA(self);
11633
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (length == 1)
11636 return PyBool_FromLong(
11637 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011639 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011642
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 for (i = 0; i < length; i++) {
11645 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011646
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11648 return PyBool_FromLong(0);
11649 else if (!cased && Py_UNICODE_ISLOWER(ch))
11650 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011652 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011658Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011659at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
11661static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011662unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 Py_ssize_t i, length;
11665 int kind;
11666 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667 int cased;
11668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (PyUnicode_READY(self) == -1)
11670 return NULL;
11671 length = PyUnicode_GET_LENGTH(self);
11672 kind = PyUnicode_KIND(self);
11673 data = PyUnicode_DATA(self);
11674
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (length == 1)
11677 return PyBool_FromLong(
11678 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011680 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011683
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 for (i = 0; i < length; i++) {
11686 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011687
Benjamin Peterson29060642009-01-31 22:14:21 +000011688 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11689 return PyBool_FromLong(0);
11690 else if (!cased && Py_UNICODE_ISUPPER(ch))
11691 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011693 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011696PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011699Return True if S is a titlecased string and there is at least one\n\
11700character in S, i.e. upper- and titlecase characters may only\n\
11701follow uncased characters and lowercase characters only cased ones.\n\
11702Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
11704static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011705unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 Py_ssize_t i, length;
11708 int kind;
11709 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 int cased, previous_is_cased;
11711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 if (PyUnicode_READY(self) == -1)
11713 return NULL;
11714 length = PyUnicode_GET_LENGTH(self);
11715 kind = PyUnicode_KIND(self);
11716 data = PyUnicode_DATA(self);
11717
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 if (length == 1) {
11720 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11721 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11722 (Py_UNICODE_ISUPPER(ch) != 0));
11723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011725 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011727 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011728
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729 cased = 0;
11730 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 for (i = 0; i < length; i++) {
11732 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011733
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11735 if (previous_is_cased)
11736 return PyBool_FromLong(0);
11737 previous_is_cased = 1;
11738 cased = 1;
11739 }
11740 else if (Py_UNICODE_ISLOWER(ch)) {
11741 if (!previous_is_cased)
11742 return PyBool_FromLong(0);
11743 previous_is_cased = 1;
11744 cased = 1;
11745 }
11746 else
11747 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011749 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750}
11751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011752PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011755Return True if all characters in S are whitespace\n\
11756and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757
11758static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011759unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 Py_ssize_t i, length;
11762 int kind;
11763 void *data;
11764
11765 if (PyUnicode_READY(self) == -1)
11766 return NULL;
11767 length = PyUnicode_GET_LENGTH(self);
11768 kind = PyUnicode_KIND(self);
11769 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (length == 1)
11773 return PyBool_FromLong(
11774 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011776 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 for (i = 0; i < length; i++) {
11781 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011782 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011785 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786}
11787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011788PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011790\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011791Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011792and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011793
11794static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011795unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 Py_ssize_t i, length;
11798 int kind;
11799 void *data;
11800
11801 if (PyUnicode_READY(self) == -1)
11802 return NULL;
11803 length = PyUnicode_GET_LENGTH(self);
11804 kind = PyUnicode_KIND(self);
11805 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011806
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011807 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 if (length == 1)
11809 return PyBool_FromLong(
11810 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011811
11812 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 for (i = 0; i < length; i++) {
11817 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011819 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011820 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011821}
11822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011823PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011825\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011826Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011827and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828
11829static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011830unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 int kind;
11833 void *data;
11834 Py_ssize_t len, i;
11835
11836 if (PyUnicode_READY(self) == -1)
11837 return NULL;
11838
11839 kind = PyUnicode_KIND(self);
11840 data = PyUnicode_DATA(self);
11841 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011842
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011843 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (len == 1) {
11845 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11846 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11847 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011848
11849 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 for (i = 0; i < len; i++) {
11854 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011855 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011857 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011858 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011859}
11860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011861PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011864Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011865False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
11867static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011868unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 Py_ssize_t i, length;
11871 int kind;
11872 void *data;
11873
11874 if (PyUnicode_READY(self) == -1)
11875 return NULL;
11876 length = PyUnicode_GET_LENGTH(self);
11877 kind = PyUnicode_KIND(self);
11878 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (length == 1)
11882 return PyBool_FromLong(
11883 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011885 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 for (i = 0; i < length; i++) {
11890 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011893 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011899Return True if all characters in S are digits\n\
11900and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
11902static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011903unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 Py_ssize_t i, length;
11906 int kind;
11907 void *data;
11908
11909 if (PyUnicode_READY(self) == -1)
11910 return NULL;
11911 length = PyUnicode_GET_LENGTH(self);
11912 kind = PyUnicode_KIND(self);
11913 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (length == 1) {
11917 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11918 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011921 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 for (i = 0; i < length; i++) {
11926 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011929 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930}
11931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011932PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011935Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011936False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937
11938static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011939unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 Py_ssize_t i, length;
11942 int kind;
11943 void *data;
11944
11945 if (PyUnicode_READY(self) == -1)
11946 return NULL;
11947 length = PyUnicode_GET_LENGTH(self);
11948 kind = PyUnicode_KIND(self);
11949 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (length == 1)
11953 return PyBool_FromLong(
11954 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011956 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 for (i = 0; i < length; i++) {
11961 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965}
11966
Martin v. Löwis47383402007-08-15 07:32:56 +000011967int
11968PyUnicode_IsIdentifier(PyObject *self)
11969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 int kind;
11971 void *data;
11972 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011973 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (PyUnicode_READY(self) == -1) {
11976 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 }
11979
11980 /* Special case for empty strings */
11981 if (PyUnicode_GET_LENGTH(self) == 0)
11982 return 0;
11983 kind = PyUnicode_KIND(self);
11984 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011985
11986 /* PEP 3131 says that the first character must be in
11987 XID_Start and subsequent characters in XID_Continue,
11988 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011989 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011990 letters, digits, underscore). However, given the current
11991 definition of XID_Start and XID_Continue, it is sufficient
11992 to check just for these, except that _ must be allowed
11993 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011995 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011996 return 0;
11997
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011998 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012001 return 1;
12002}
12003
12004PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012006\n\
12007Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012008to the language definition.\n\
12009\n\
12010Use keyword.iskeyword() to test for reserved identifiers\n\
12011such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012012
12013static PyObject*
12014unicode_isidentifier(PyObject *self)
12015{
12016 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12017}
12018
Georg Brandl559e5d72008-06-11 18:37:52 +000012019PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012021\n\
12022Return True if all characters in S are considered\n\
12023printable in repr() or S is empty, False otherwise.");
12024
12025static PyObject*
12026unicode_isprintable(PyObject *self)
12027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 Py_ssize_t i, length;
12029 int kind;
12030 void *data;
12031
12032 if (PyUnicode_READY(self) == -1)
12033 return NULL;
12034 length = PyUnicode_GET_LENGTH(self);
12035 kind = PyUnicode_KIND(self);
12036 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012037
12038 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (length == 1)
12040 return PyBool_FromLong(
12041 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 for (i = 0; i < length; i++) {
12044 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012045 Py_RETURN_FALSE;
12046 }
12047 }
12048 Py_RETURN_TRUE;
12049}
12050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012051PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012052 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053\n\
12054Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012055iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
12057static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012058unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012060 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061}
12062
Martin v. Löwis18e16552006-02-15 17:27:45 +000012063static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012064unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if (PyUnicode_READY(self) == -1)
12067 return -1;
12068 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069}
12070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012071PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012074Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012075done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
12077static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012078unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012080 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 Py_UCS4 fillchar = ' ';
12082
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012083 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 return NULL;
12085
Benjamin Petersonbac79492012-01-14 13:34:47 -050012086 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Victor Stinnerc4b49542011-12-11 22:44:26 +010012089 if (PyUnicode_GET_LENGTH(self) >= width)
12090 return unicode_result_unchanged(self);
12091
12092 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093}
12094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012095PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012098Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
12100static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012101unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012103 if (PyUnicode_READY(self) == -1)
12104 return NULL;
12105 if (PyUnicode_IS_ASCII(self))
12106 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012107 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108}
12109
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
12118
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119/* externally visible for str.strip(unicode) */
12120PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012121_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 void *data;
12124 int kind;
12125 Py_ssize_t i, j, len;
12126 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012127 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12130 return NULL;
12131
12132 kind = PyUnicode_KIND(self);
12133 data = PyUnicode_DATA(self);
12134 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012135 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12137 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012138 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012139
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 i = 0;
12141 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012142 while (i < len) {
12143 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12144 if (!BLOOM(sepmask, ch))
12145 break;
12146 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12147 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 i++;
12149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151
Benjamin Peterson14339b62009-01-31 16:36:08 +000012152 j = len;
12153 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012154 j--;
12155 while (j >= i) {
12156 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12157 if (!BLOOM(sepmask, ch))
12158 break;
12159 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12160 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012161 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012162 }
12163
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166
Victor Stinner7931d9a2011-11-04 00:22:48 +010012167 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168}
12169
12170PyObject*
12171PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12172{
12173 unsigned char *data;
12174 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012175 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176
Victor Stinnerde636f32011-10-01 03:55:54 +020012177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179
Victor Stinner684d5fd2012-05-03 02:32:34 +020012180 length = PyUnicode_GET_LENGTH(self);
12181 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012182
Victor Stinner684d5fd2012-05-03 02:32:34 +020012183 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012184 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185
Victor Stinnerde636f32011-10-01 03:55:54 +020012186 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012187 PyErr_SetString(PyExc_IndexError, "string index out of range");
12188 return NULL;
12189 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012190 if (start >= length || end < start)
12191 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012192
Victor Stinner684d5fd2012-05-03 02:32:34 +020012193 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012194 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012195 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012196 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012197 }
12198 else {
12199 kind = PyUnicode_KIND(self);
12200 data = PyUnicode_1BYTE_DATA(self);
12201 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012202 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012203 length);
12204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
12207static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012208do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 Py_ssize_t len, i, j;
12211
12212 if (PyUnicode_READY(self) == -1)
12213 return NULL;
12214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012216
Victor Stinnercc7af722013-04-09 22:39:24 +020012217 if (PyUnicode_IS_ASCII(self)) {
12218 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12219
12220 i = 0;
12221 if (striptype != RIGHTSTRIP) {
12222 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012223 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012224 if (!_Py_ascii_whitespace[ch])
12225 break;
12226 i++;
12227 }
12228 }
12229
12230 j = len;
12231 if (striptype != LEFTSTRIP) {
12232 j--;
12233 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012234 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012235 if (!_Py_ascii_whitespace[ch])
12236 break;
12237 j--;
12238 }
12239 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012240 }
12241 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012242 else {
12243 int kind = PyUnicode_KIND(self);
12244 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012245
Victor Stinnercc7af722013-04-09 22:39:24 +020012246 i = 0;
12247 if (striptype != RIGHTSTRIP) {
12248 while (i < len) {
12249 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12250 if (!Py_UNICODE_ISSPACE(ch))
12251 break;
12252 i++;
12253 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012254 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012255
12256 j = len;
12257 if (striptype != LEFTSTRIP) {
12258 j--;
12259 while (j >= i) {
12260 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12261 if (!Py_UNICODE_ISSPACE(ch))
12262 break;
12263 j--;
12264 }
12265 j++;
12266 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012267 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012268
Victor Stinner7931d9a2011-11-04 00:22:48 +010012269 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270}
12271
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012272
12273static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012274do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012276 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Serhiy Storchakac6792272013-10-19 21:03:34 +030012278 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012279 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012280
Benjamin Peterson14339b62009-01-31 16:36:08 +000012281 if (sep != NULL && sep != Py_None) {
12282 if (PyUnicode_Check(sep))
12283 return _PyUnicode_XStrip(self, striptype, sep);
12284 else {
12285 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 "%s arg must be None or str",
12287 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012288 return NULL;
12289 }
12290 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012291
Benjamin Peterson14339b62009-01-31 16:36:08 +000012292 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293}
12294
12295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012296PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298\n\
12299Return a copy of the string S with leading and trailing\n\
12300whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012301If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302
12303static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012304unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012305{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 if (PyTuple_GET_SIZE(args) == 0)
12307 return do_strip(self, BOTHSTRIP); /* Common case */
12308 else
12309 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310}
12311
12312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012313PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012315\n\
12316Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012317If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318
12319static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012320unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012321{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 if (PyTuple_GET_SIZE(args) == 0)
12323 return do_strip(self, LEFTSTRIP); /* Common case */
12324 else
12325 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326}
12327
12328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012329PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331\n\
12332Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012333If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334
12335static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012336unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012337{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012338 if (PyTuple_GET_SIZE(args) == 0)
12339 return do_strip(self, RIGHTSTRIP); /* Common case */
12340 else
12341 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342}
12343
12344
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012346unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012348 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350
Serhiy Storchaka05997252013-01-26 12:14:02 +020012351 if (len < 1)
12352 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
Victor Stinnerc4b49542011-12-11 22:44:26 +010012354 /* no repeat, return original string */
12355 if (len == 1)
12356 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012357
Benjamin Petersonbac79492012-01-14 13:34:47 -050012358 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 return NULL;
12360
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012361 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012362 PyErr_SetString(PyExc_OverflowError,
12363 "repeated string is too long");
12364 return NULL;
12365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012367
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012368 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 if (!u)
12370 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012371 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 if (PyUnicode_GET_LENGTH(str) == 1) {
12374 const int kind = PyUnicode_KIND(str);
12375 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012376 if (kind == PyUnicode_1BYTE_KIND) {
12377 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012378 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012379 }
12380 else if (kind == PyUnicode_2BYTE_KIND) {
12381 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012382 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012383 ucs2[n] = fill_char;
12384 } else {
12385 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12386 assert(kind == PyUnicode_4BYTE_KIND);
12387 for (n = 0; n < len; ++n)
12388 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 }
12391 else {
12392 /* number of characters copied this far */
12393 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012394 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 char *to = (char *) PyUnicode_DATA(u);
12396 Py_MEMCPY(to, PyUnicode_DATA(str),
12397 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 n = (done <= nchars-done) ? done : nchars-done;
12400 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012401 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403 }
12404
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012405 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012406 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407}
12408
Alexander Belopolsky40018472011-02-26 01:02:56 +000012409PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012410PyUnicode_Replace(PyObject *str,
12411 PyObject *substr,
12412 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012413 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012415 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12416 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012418 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419}
12420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012421PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012422 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423\n\
12424Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012425old replaced by new. If the optional argument count is\n\
12426given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427
12428static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 PyObject *str1;
12432 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012433 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012435 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012437 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012439 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440}
12441
Alexander Belopolsky40018472011-02-26 01:02:56 +000012442static PyObject *
12443unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012445 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 Py_ssize_t isize;
12447 Py_ssize_t osize, squote, dquote, i, o;
12448 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012449 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012453 return NULL;
12454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 isize = PyUnicode_GET_LENGTH(unicode);
12456 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 /* Compute length of output, quote characters, and
12459 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012460 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 max = 127;
12462 squote = dquote = 0;
12463 ikind = PyUnicode_KIND(unicode);
12464 for (i = 0; i < isize; i++) {
12465 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012466 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012468 case '\'': squote++; break;
12469 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012471 incr = 2;
12472 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 default:
12474 /* Fast-path ASCII */
12475 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012476 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012478 ;
12479 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012482 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012484 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012486 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012488 if (osize > PY_SSIZE_T_MAX - incr) {
12489 PyErr_SetString(PyExc_OverflowError,
12490 "string is too long to generate repr");
12491 return NULL;
12492 }
12493 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 }
12495
12496 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012497 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012499 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 if (dquote)
12501 /* Both squote and dquote present. Use squote,
12502 and escape them */
12503 osize += squote;
12504 else
12505 quote = '"';
12506 }
Victor Stinner55c08782013-04-14 18:45:39 +020012507 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508
12509 repr = PyUnicode_New(osize, max);
12510 if (repr == NULL)
12511 return NULL;
12512 okind = PyUnicode_KIND(repr);
12513 odata = PyUnicode_DATA(repr);
12514
12515 PyUnicode_WRITE(okind, odata, 0, quote);
12516 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012517 if (unchanged) {
12518 _PyUnicode_FastCopyCharacters(repr, 1,
12519 unicode, 0,
12520 isize);
12521 }
12522 else {
12523 for (i = 0, o = 1; i < isize; i++) {
12524 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525
Victor Stinner55c08782013-04-14 18:45:39 +020012526 /* Escape quotes and backslashes */
12527 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012528 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012530 continue;
12531 }
12532
12533 /* Map special whitespace to '\t', \n', '\r' */
12534 if (ch == '\t') {
12535 PyUnicode_WRITE(okind, odata, o++, '\\');
12536 PyUnicode_WRITE(okind, odata, o++, 't');
12537 }
12538 else if (ch == '\n') {
12539 PyUnicode_WRITE(okind, odata, o++, '\\');
12540 PyUnicode_WRITE(okind, odata, o++, 'n');
12541 }
12542 else if (ch == '\r') {
12543 PyUnicode_WRITE(okind, odata, o++, '\\');
12544 PyUnicode_WRITE(okind, odata, o++, 'r');
12545 }
12546
12547 /* Map non-printable US ASCII to '\xhh' */
12548 else if (ch < ' ' || ch == 0x7F) {
12549 PyUnicode_WRITE(okind, odata, o++, '\\');
12550 PyUnicode_WRITE(okind, odata, o++, 'x');
12551 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12552 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12553 }
12554
12555 /* Copy ASCII characters as-is */
12556 else if (ch < 0x7F) {
12557 PyUnicode_WRITE(okind, odata, o++, ch);
12558 }
12559
12560 /* Non-ASCII characters */
12561 else {
12562 /* Map Unicode whitespace and control characters
12563 (categories Z* and C* except ASCII space)
12564 */
12565 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12566 PyUnicode_WRITE(okind, odata, o++, '\\');
12567 /* Map 8-bit characters to '\xhh' */
12568 if (ch <= 0xff) {
12569 PyUnicode_WRITE(okind, odata, o++, 'x');
12570 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12571 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12572 }
12573 /* Map 16-bit characters to '\uxxxx' */
12574 else if (ch <= 0xffff) {
12575 PyUnicode_WRITE(okind, odata, o++, 'u');
12576 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12577 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12578 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12580 }
12581 /* Map 21-bit characters to '\U00xxxxxx' */
12582 else {
12583 PyUnicode_WRITE(okind, odata, o++, 'U');
12584 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12585 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12586 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12587 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12588 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12589 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12590 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12591 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12592 }
12593 }
12594 /* Copy characters as-is */
12595 else {
12596 PyUnicode_WRITE(okind, odata, o++, ch);
12597 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012598 }
12599 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012602 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012603 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604}
12605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012606PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608\n\
12609Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012610such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611arguments start and end are interpreted as in slice notation.\n\
12612\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012613Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
12615static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012618 /* initialize variables to prevent gcc warning */
12619 PyObject *substring = NULL;
12620 Py_ssize_t start = 0;
12621 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012622 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012624 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012627 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012630 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 if (result == -2)
12633 return NULL;
12634
Christian Heimes217cfd12007-12-02 14:31:20 +000012635 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636}
12637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012638PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012641Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642
12643static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012646 /* initialize variables to prevent gcc warning */
12647 PyObject *substring = NULL;
12648 Py_ssize_t start = 0;
12649 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012650 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012652 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012655 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012658 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 if (result == -2)
12661 return NULL;
12662
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663 if (result < 0) {
12664 PyErr_SetString(PyExc_ValueError, "substring not found");
12665 return NULL;
12666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667
Christian Heimes217cfd12007-12-02 14:31:20 +000012668 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669}
12670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012671PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012674Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012675done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676
12677static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012678unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012680 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 Py_UCS4 fillchar = ' ';
12682
Victor Stinnere9a29352011-10-01 02:14:59 +020012683 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012685
Benjamin Petersonbac79492012-01-14 13:34:47 -050012686 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687 return NULL;
12688
Victor Stinnerc4b49542011-12-11 22:44:26 +010012689 if (PyUnicode_GET_LENGTH(self) >= width)
12690 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691
Victor Stinnerc4b49542011-12-11 22:44:26 +010012692 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693}
12694
Alexander Belopolsky40018472011-02-26 01:02:56 +000012695PyObject *
12696PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012698 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012701 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702}
12703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012704PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012705 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706\n\
12707Return a list of the words in S, using sep as the\n\
12708delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012709splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012710whitespace string is a separator and empty strings are\n\
12711removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712
12713static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012714unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012716 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012718 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012720 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12721 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 return NULL;
12723
12724 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012726
12727 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012728 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012729
12730 PyErr_Format(PyExc_TypeError,
12731 "must be str or None, not %.100s",
12732 Py_TYPE(substring)->tp_name);
12733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734}
12735
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012737PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012739 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012740 int kind1, kind2;
12741 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012744 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012746
Victor Stinner14f8f022011-10-05 20:58:25 +020012747 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 len1 = PyUnicode_GET_LENGTH(str_obj);
12750 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012751 if (kind1 < kind2 || len1 < len2) {
12752 _Py_INCREF_UNICODE_EMPTY();
12753 if (!unicode_empty)
12754 out = NULL;
12755 else {
12756 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12757 Py_DECREF(unicode_empty);
12758 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012759 return out;
12760 }
12761 buf1 = PyUnicode_DATA(str_obj);
12762 buf2 = PyUnicode_DATA(sep_obj);
12763 if (kind2 != kind1) {
12764 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12765 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012766 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012769 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012771 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12772 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12773 else
12774 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 break;
12776 case PyUnicode_2BYTE_KIND:
12777 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12778 break;
12779 case PyUnicode_4BYTE_KIND:
12780 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12781 break;
12782 default:
12783 assert(0);
12784 out = 0;
12785 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012787 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789
12790 return out;
12791}
12792
12793
12794PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012795PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012798 int kind1, kind2;
12799 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012802 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012805 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 len1 = PyUnicode_GET_LENGTH(str_obj);
12808 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012809 if (kind1 < kind2 || len1 < len2) {
12810 _Py_INCREF_UNICODE_EMPTY();
12811 if (!unicode_empty)
12812 out = NULL;
12813 else {
12814 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12815 Py_DECREF(unicode_empty);
12816 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012817 return out;
12818 }
12819 buf1 = PyUnicode_DATA(str_obj);
12820 buf2 = PyUnicode_DATA(sep_obj);
12821 if (kind2 != kind1) {
12822 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12823 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012824 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012827 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012829 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12830 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12831 else
12832 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 break;
12834 case PyUnicode_2BYTE_KIND:
12835 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12836 break;
12837 case PyUnicode_4BYTE_KIND:
12838 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12839 break;
12840 default:
12841 assert(0);
12842 out = 0;
12843 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012845 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012847
12848 return out;
12849}
12850
12851PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012853\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012854Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012855the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012856found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012857
12858static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012859unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860{
Victor Stinner9310abb2011-10-05 00:59:23 +020012861 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862}
12863
12864PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012865 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012866\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012867Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012868the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012869separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012870
12871static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012872unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012873{
Victor Stinner9310abb2011-10-05 00:59:23 +020012874 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875}
12876
Alexander Belopolsky40018472011-02-26 01:02:56 +000012877PyObject *
12878PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012879{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012880 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012881 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012882
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012883 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012884}
12885
12886PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012887 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012888\n\
12889Return a list of the words in S, using sep as the\n\
12890delimiter string, starting at the end of the string and\n\
12891working to the front. If maxsplit is given, at most maxsplit\n\
12892splits are done. If sep is not specified, any whitespace string\n\
12893is a separator.");
12894
12895static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012896unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012897{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012898 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012899 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012900 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012901
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012902 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12903 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012904 return NULL;
12905
12906 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012908
12909 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012910 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012911
12912 PyErr_Format(PyExc_TypeError,
12913 "must be str or None, not %.100s",
12914 Py_TYPE(substring)->tp_name);
12915 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012916}
12917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012918PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920\n\
12921Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012922Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012923is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924
12925static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012926unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012928 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012929 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012931 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12932 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933 return NULL;
12934
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012935 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936}
12937
12938static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012939PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012941 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942}
12943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012944PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012945 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946\n\
12947Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012948and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949
12950static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012951unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012953 if (PyUnicode_READY(self) == -1)
12954 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012955 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956}
12957
Larry Hastings61272b72014-01-07 12:41:53 -080012958/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012959
Larry Hastings31826802013-10-19 00:09:25 -070012960@staticmethod
12961str.maketrans as unicode_maketrans
12962
12963 x: object
12964
12965 y: unicode=NULL
12966
12967 z: unicode=NULL
12968
12969 /
12970
12971Return a translation table usable for str.translate().
12972
12973If there is only one argument, it must be a dictionary mapping Unicode
12974ordinals (integers) or characters to Unicode ordinals, strings or None.
12975Character keys will be then converted to ordinals.
12976If there are two arguments, they must be strings of equal length, and
12977in the resulting dictionary, each character in x will be mapped to the
12978character at the same position in y. If there is a third argument, it
12979must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012980[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012981
Larry Hastings31826802013-10-19 00:09:25 -070012982static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012983unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012984/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012985{
Georg Brandlceee0772007-11-27 23:48:05 +000012986 PyObject *new = NULL, *key, *value;
12987 Py_ssize_t i = 0;
12988 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012989
Georg Brandlceee0772007-11-27 23:48:05 +000012990 new = PyDict_New();
12991 if (!new)
12992 return NULL;
12993 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 int x_kind, y_kind, z_kind;
12995 void *x_data, *y_data, *z_data;
12996
Georg Brandlceee0772007-11-27 23:48:05 +000012997 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012998 if (!PyUnicode_Check(x)) {
12999 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13000 "be a string if there is a second argument");
13001 goto err;
13002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013004 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13005 "arguments must have equal length");
13006 goto err;
13007 }
13008 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 x_kind = PyUnicode_KIND(x);
13010 y_kind = PyUnicode_KIND(y);
13011 x_data = PyUnicode_DATA(x);
13012 y_data = PyUnicode_DATA(y);
13013 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13014 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013015 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013016 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013017 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013018 if (!value) {
13019 Py_DECREF(key);
13020 goto err;
13021 }
Georg Brandlceee0772007-11-27 23:48:05 +000013022 res = PyDict_SetItem(new, key, value);
13023 Py_DECREF(key);
13024 Py_DECREF(value);
13025 if (res < 0)
13026 goto err;
13027 }
13028 /* create entries for deleting chars in z */
13029 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 z_kind = PyUnicode_KIND(z);
13031 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013032 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013034 if (!key)
13035 goto err;
13036 res = PyDict_SetItem(new, key, Py_None);
13037 Py_DECREF(key);
13038 if (res < 0)
13039 goto err;
13040 }
13041 }
13042 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 int kind;
13044 void *data;
13045
Georg Brandlceee0772007-11-27 23:48:05 +000013046 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013047 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013048 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13049 "to maketrans it must be a dict");
13050 goto err;
13051 }
13052 /* copy entries into the new dict, converting string keys to int keys */
13053 while (PyDict_Next(x, &i, &key, &value)) {
13054 if (PyUnicode_Check(key)) {
13055 /* convert string keys to integer keys */
13056 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013057 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013058 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13059 "table must be of length 1");
13060 goto err;
13061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 kind = PyUnicode_KIND(key);
13063 data = PyUnicode_DATA(key);
13064 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013065 if (!newkey)
13066 goto err;
13067 res = PyDict_SetItem(new, newkey, value);
13068 Py_DECREF(newkey);
13069 if (res < 0)
13070 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013071 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013072 /* just keep integer keys */
13073 if (PyDict_SetItem(new, key, value) < 0)
13074 goto err;
13075 } else {
13076 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13077 "be strings or integers");
13078 goto err;
13079 }
13080 }
13081 }
13082 return new;
13083 err:
13084 Py_DECREF(new);
13085 return NULL;
13086}
13087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013088PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013089 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013091Return a copy of the string S in which each character has been mapped\n\
13092through the given translation table. The table must implement\n\
13093lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13094mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13095this operation raises LookupError, the character is left untouched.\n\
13096Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097
13098static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102}
13103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013104PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013107Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108
13109static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013110unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013112 if (PyUnicode_READY(self) == -1)
13113 return NULL;
13114 if (PyUnicode_IS_ASCII(self))
13115 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013116 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117}
13118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013119PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013122Pad a numeric string S with zeros on the left, to fill a field\n\
13123of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
13125static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013126unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013128 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013129 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013130 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 int kind;
13132 void *data;
13133 Py_UCS4 chr;
13134
Martin v. Löwis18e16552006-02-15 17:27:45 +000013135 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 return NULL;
13137
Benjamin Petersonbac79492012-01-14 13:34:47 -050013138 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140
Victor Stinnerc4b49542011-12-11 22:44:26 +010013141 if (PyUnicode_GET_LENGTH(self) >= width)
13142 return unicode_result_unchanged(self);
13143
13144 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
13146 u = pad(self, fill, 0, '0');
13147
Walter Dörwald068325e2002-04-15 13:36:47 +000013148 if (u == NULL)
13149 return NULL;
13150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 kind = PyUnicode_KIND(u);
13152 data = PyUnicode_DATA(u);
13153 chr = PyUnicode_READ(kind, data, fill);
13154
13155 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 PyUnicode_WRITE(kind, data, 0, chr);
13158 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159 }
13160
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013161 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013162 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164
#if 0
/* Compiled out. Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013173PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013176Return True if S starts with the specified prefix, False otherwise.\n\
13177With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013178With optional end, stop comparing S at that position.\n\
13179prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180
13181static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013182unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013185 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013186 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013187 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013188 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013189 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190
Jesus Ceaac451502011-04-20 17:09:23 +020013191 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013193 if (PyTuple_Check(subobj)) {
13194 Py_ssize_t i;
13195 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013196 substring = PyTuple_GET_ITEM(subobj, i);
13197 if (!PyUnicode_Check(substring)) {
13198 PyErr_Format(PyExc_TypeError,
13199 "tuple for startswith must only contain str, "
13200 "not %.100s",
13201 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013203 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013204 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013205 if (result == -1)
13206 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207 if (result) {
13208 Py_RETURN_TRUE;
13209 }
13210 }
13211 /* nothing matched */
13212 Py_RETURN_FALSE;
13213 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013214 if (!PyUnicode_Check(subobj)) {
13215 PyErr_Format(PyExc_TypeError,
13216 "startswith first arg must be str or "
13217 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013219 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013220 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013221 if (result == -1)
13222 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013223 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224}
13225
13226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013227PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013228 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013230Return True if S ends with the specified suffix, False otherwise.\n\
13231With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013232With optional end, stop comparing S at that position.\n\
13233suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234
13235static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013236unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013239 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013240 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013241 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013242 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013243 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244
Jesus Ceaac451502011-04-20 17:09:23 +020013245 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013247 if (PyTuple_Check(subobj)) {
13248 Py_ssize_t i;
13249 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013250 substring = PyTuple_GET_ITEM(subobj, i);
13251 if (!PyUnicode_Check(substring)) {
13252 PyErr_Format(PyExc_TypeError,
13253 "tuple for endswith must only contain str, "
13254 "not %.100s",
13255 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013257 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013258 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013259 if (result == -1)
13260 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013261 if (result) {
13262 Py_RETURN_TRUE;
13263 }
13264 }
13265 Py_RETURN_FALSE;
13266 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013267 if (!PyUnicode_Check(subobj)) {
13268 PyErr_Format(PyExc_TypeError,
13269 "endswith first arg must be str or "
13270 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013272 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013273 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013274 if (result == -1)
13275 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013276 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277}
13278
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013279static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013280_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013281{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013282 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13283 writer->data = PyUnicode_DATA(writer->buffer);
13284
13285 if (!writer->readonly) {
13286 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013287 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013288 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013289 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013290 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13291 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13292 writer->kind = PyUnicode_WCHAR_KIND;
13293 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13294
Victor Stinner8f674cc2013-04-17 23:02:17 +020013295 /* Copy-on-write mode: set buffer size to 0 so
13296 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13297 * next write. */
13298 writer->size = 0;
13299 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013300}
13301
Victor Stinnerd3f08822012-05-29 12:57:52 +020013302void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013303_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013304{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013305 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013306
13307 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013308 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013309
13310 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13311 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13312 writer->kind = PyUnicode_WCHAR_KIND;
13313 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013314}
13315
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316int
13317_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13318 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013319{
13320 Py_ssize_t newlen;
13321 PyObject *newbuffer;
13322
Victor Stinner2740e462016-09-06 16:58:36 -070013323 assert(maxchar <= MAX_UNICODE);
13324
Victor Stinnerca9381e2015-09-22 00:58:32 +020013325 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013326 assert((maxchar > writer->maxchar && length >= 0)
13327 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013328
Victor Stinner202fdca2012-05-07 12:47:02 +020013329 if (length > PY_SSIZE_T_MAX - writer->pos) {
13330 PyErr_NoMemory();
13331 return -1;
13332 }
13333 newlen = writer->pos + length;
13334
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013335 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013336
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013338 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013339 if (writer->overallocate
13340 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13341 /* overallocate to limit the number of realloc() */
13342 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013343 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013344 if (newlen < writer->min_length)
13345 newlen = writer->min_length;
13346
Victor Stinnerd3f08822012-05-29 12:57:52 +020013347 writer->buffer = PyUnicode_New(newlen, maxchar);
13348 if (writer->buffer == NULL)
13349 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013350 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013351 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013352 if (writer->overallocate
13353 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13354 /* overallocate to limit the number of realloc() */
13355 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013356 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013357 if (newlen < writer->min_length)
13358 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013359
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013360 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013361 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013362 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013363 newbuffer = PyUnicode_New(newlen, maxchar);
13364 if (newbuffer == NULL)
13365 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13367 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013368 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013369 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013370 }
13371 else {
13372 newbuffer = resize_compact(writer->buffer, newlen);
13373 if (newbuffer == NULL)
13374 return -1;
13375 }
13376 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013377 }
13378 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013379 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013380 newbuffer = PyUnicode_New(writer->size, maxchar);
13381 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013382 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013383 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13384 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013385 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013386 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013387 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013388 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013389
13390#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013391}
13392
Victor Stinnerca9381e2015-09-22 00:58:32 +020013393int
13394_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13395 enum PyUnicode_Kind kind)
13396{
13397 Py_UCS4 maxchar;
13398
13399 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13400 assert(writer->kind < kind);
13401
13402 switch (kind)
13403 {
13404 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13405 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13406 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13407 default:
13408 assert(0 && "invalid kind");
13409 return -1;
13410 }
13411
13412 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13413}
13414
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013415static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013416_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013417{
Victor Stinner2740e462016-09-06 16:58:36 -070013418 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013419 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13420 return -1;
13421 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13422 writer->pos++;
13423 return 0;
13424}
13425
13426int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013427_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13428{
13429 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13430}
13431
13432int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013433_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13434{
13435 Py_UCS4 maxchar;
13436 Py_ssize_t len;
13437
13438 if (PyUnicode_READY(str) == -1)
13439 return -1;
13440 len = PyUnicode_GET_LENGTH(str);
13441 if (len == 0)
13442 return 0;
13443 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13444 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013445 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013446 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013447 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013448 Py_INCREF(str);
13449 writer->buffer = str;
13450 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013451 writer->pos += len;
13452 return 0;
13453 }
13454 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13455 return -1;
13456 }
13457 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13458 str, 0, len);
13459 writer->pos += len;
13460 return 0;
13461}
13462
Victor Stinnere215d962012-10-06 23:03:36 +020013463int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013464_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13465 Py_ssize_t start, Py_ssize_t end)
13466{
13467 Py_UCS4 maxchar;
13468 Py_ssize_t len;
13469
13470 if (PyUnicode_READY(str) == -1)
13471 return -1;
13472
13473 assert(0 <= start);
13474 assert(end <= PyUnicode_GET_LENGTH(str));
13475 assert(start <= end);
13476
13477 if (end == 0)
13478 return 0;
13479
13480 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13481 return _PyUnicodeWriter_WriteStr(writer, str);
13482
13483 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13484 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13485 else
13486 maxchar = writer->maxchar;
13487 len = end - start;
13488
13489 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13490 return -1;
13491
13492 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13493 str, start, len);
13494 writer->pos += len;
13495 return 0;
13496}
13497
13498int
Victor Stinner4a587072013-11-19 12:54:53 +010013499_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13500 const char *ascii, Py_ssize_t len)
13501{
13502 if (len == -1)
13503 len = strlen(ascii);
13504
13505 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13506
13507 if (writer->buffer == NULL && !writer->overallocate) {
13508 PyObject *str;
13509
13510 str = _PyUnicode_FromASCII(ascii, len);
13511 if (str == NULL)
13512 return -1;
13513
13514 writer->readonly = 1;
13515 writer->buffer = str;
13516 _PyUnicodeWriter_Update(writer);
13517 writer->pos += len;
13518 return 0;
13519 }
13520
13521 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13522 return -1;
13523
13524 switch (writer->kind)
13525 {
13526 case PyUnicode_1BYTE_KIND:
13527 {
13528 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13529 Py_UCS1 *data = writer->data;
13530
13531 Py_MEMCPY(data + writer->pos, str, len);
13532 break;
13533 }
13534 case PyUnicode_2BYTE_KIND:
13535 {
13536 _PyUnicode_CONVERT_BYTES(
13537 Py_UCS1, Py_UCS2,
13538 ascii, ascii + len,
13539 (Py_UCS2 *)writer->data + writer->pos);
13540 break;
13541 }
13542 case PyUnicode_4BYTE_KIND:
13543 {
13544 _PyUnicode_CONVERT_BYTES(
13545 Py_UCS1, Py_UCS4,
13546 ascii, ascii + len,
13547 (Py_UCS4 *)writer->data + writer->pos);
13548 break;
13549 }
13550 default:
13551 assert(0);
13552 }
13553
13554 writer->pos += len;
13555 return 0;
13556}
13557
13558int
13559_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13560 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013561{
13562 Py_UCS4 maxchar;
13563
13564 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13565 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13566 return -1;
13567 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13568 writer->pos += len;
13569 return 0;
13570}
13571
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013573_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013574{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013575 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013576 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013577 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013578 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013580 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013581 str = writer->buffer;
13582 writer->buffer = NULL;
13583 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13584 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013585 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013586 if (writer->pos == 0) {
13587 Py_CLEAR(writer->buffer);
13588
13589 /* Get the empty Unicode string singleton ('') */
13590 _Py_INCREF_UNICODE_EMPTY();
13591 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013592 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013593 else {
13594 str = writer->buffer;
13595 writer->buffer = NULL;
13596
13597 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13598 PyObject *str2;
13599 str2 = resize_compact(str, writer->pos);
13600 if (str2 == NULL)
13601 return NULL;
13602 str = str2;
13603 }
13604 }
13605
Victor Stinner15a0bd32013-07-08 22:29:55 +020013606 assert(_PyUnicode_CheckConsistency(str, 1));
13607 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013608}
13609
Victor Stinnerd3f08822012-05-29 12:57:52 +020013610void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013611_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013612{
13613 Py_CLEAR(writer->buffer);
13614}
13615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013616#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013617
13618PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013620\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013621Return a formatted version of S, using substitutions from args and kwargs.\n\
13622The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013623
Eric Smith27bbca62010-11-04 17:06:58 +000013624PyDoc_STRVAR(format_map__doc__,
13625 "S.format_map(mapping) -> str\n\
13626\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013627Return a formatted version of S, using substitutions from mapping.\n\
13628The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013629
Eric Smith4a7d76d2008-05-30 18:10:19 +000013630static PyObject *
13631unicode__format__(PyObject* self, PyObject* args)
13632{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633 PyObject *format_spec;
13634 _PyUnicodeWriter writer;
13635 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013636
13637 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13638 return NULL;
13639
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640 if (PyUnicode_READY(self) == -1)
13641 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013642 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013643 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13644 self, format_spec, 0,
13645 PyUnicode_GET_LENGTH(format_spec));
13646 if (ret == -1) {
13647 _PyUnicodeWriter_Dealloc(&writer);
13648 return NULL;
13649 }
13650 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013651}
13652
Eric Smith8c663262007-08-25 02:26:07 +000013653PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013655\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013656Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013657
13658static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013659unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013661 Py_ssize_t size;
13662
13663 /* If it's a compact object, account for base structure +
13664 character data. */
13665 if (PyUnicode_IS_COMPACT_ASCII(v))
13666 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13667 else if (PyUnicode_IS_COMPACT(v))
13668 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013669 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 else {
13671 /* If it is a two-block object, account for base object, and
13672 for character block if present. */
13673 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013674 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013676 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013677 }
13678 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013679 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013680 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013682 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013683 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684
13685 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013686}
13687
/* Docstring attached to the "__sizeof__" entry of the method table. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013690
13691static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013692unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013693{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013694 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013695 if (!copy)
13696 return NULL;
13697 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013698}
13699
/* Method table for the string type.  Each entry gives the Python-level
   method name, the C function implementing it, the calling-convention
   flags (METH_*) and, where available, the docstring.  The table is
   terminated by the {NULL, NULL} sentinel entry.
   NOTE(review): presumably referenced from the type object's tp_methods
   slot, which is not visible in this part of the file. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Macro defined elsewhere; presumably expands to the "maketrans"
       entry including its trailing comma. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for __getnewargs__; the omitted ml_doc member
       is zero-initialized. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13757
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013758static PyObject *
13759unicode_mod(PyObject *v, PyObject *w)
13760{
Brian Curtindfc80e32011-08-10 20:28:54 -050013761 if (!PyUnicode_Check(v))
13762 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013763 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013764}
13765
/* Number protocol slots.  Only nb_remainder is filled in, so that the %
   operator maps to unicode_mod(); the members after it are not listed and
   are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13772
/* Sequence protocol slots: length, concatenation, repetition, item access
   and containment.  The slice and item-assignment slots are left at 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13783
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013784static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013785unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 if (PyUnicode_READY(self) == -1)
13788 return NULL;
13789
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013790 if (PyIndex_Check(item)) {
13791 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013792 if (i == -1 && PyErr_Occurred())
13793 return NULL;
13794 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013795 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013796 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013797 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013798 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013799 PyObject *result;
13800 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013801 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013802 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013806 return NULL;
13807 }
13808
13809 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013810 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013812 slicelength == PyUnicode_GET_LENGTH(self)) {
13813 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013814 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013815 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013816 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013817 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013818 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013819 src_kind = PyUnicode_KIND(self);
13820 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013821 if (!PyUnicode_IS_ASCII(self)) {
13822 kind_limit = kind_maxchar_limit(src_kind);
13823 max_char = 0;
13824 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13825 ch = PyUnicode_READ(src_kind, src_data, cur);
13826 if (ch > max_char) {
13827 max_char = ch;
13828 if (max_char >= kind_limit)
13829 break;
13830 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013831 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013832 }
Victor Stinner55c99112011-10-13 01:17:06 +020013833 else
13834 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013835 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013836 if (result == NULL)
13837 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013838 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013839 dest_data = PyUnicode_DATA(result);
13840
13841 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013842 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13843 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013844 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013845 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013846 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013847 } else {
13848 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13849 return NULL;
13850 }
13851}
13852
/* Mapping protocol slots: length and subscripting (which handles both
   indices and slices).  There is no subscript assignment. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13858
Guido van Rossumd57fd912000-03-10 22:53:23 +000013859
Guido van Rossumd57fd912000-03-10 22:53:23 +000013860/* Helpers for PyUnicode_Format() */
13861
/* State shared by the helpers of PyUnicode_Format() while one format
   string is being processed. */
struct unicode_formatter_t {
    /* Argument source: indexed with PyTuple_GetItem() when arglen >= 0,
       returned as-is (a single object) when arglen < 0. */
    PyObject *args;
    /* Non-zero when args was obtained by a mapping lookup and the
       reference must be released before args is replaced. */
    int args_owned;
    /* arglen: number of available arguments (negative: single object);
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* Mapping used for "%(name)s" lookups; NULL when the format was not
       given a mapping. */
    PyObject *dict;

    /* Kind and data pointer used to read characters of the format
       string. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt is decremented and fmtpos incremented as each character of
       the format string is consumed. */
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    /* The format string object itself (used to extract mapping keys). */
    PyObject *fmtstr;

    /* Writer receiving the formatted output. */
    _PyUnicodeWriter writer;
};
13875
/* Parsed description of a single "%..." conversion. */
struct unicode_format_arg_t {
    /* Character currently being examined; once parsing is finished this
       is the conversion code (e.g. 'd', 'x', 'f'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Field width; the integer fast path treats -1 as "not specified". */
    Py_ssize_t width;
    /* Precision; a negative value means "not specified" (float formatting
       then falls back to 6). */
    int prec;
    /* NOTE(review): not used by the code visible here -- presumably the
       sign character chosen while producing the output; confirm. */
    int sign;
};
13883
Guido van Rossumd57fd912000-03-10 22:53:23 +000013884static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013885unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013886{
Victor Stinnera47082312012-10-04 02:19:54 +020013887 Py_ssize_t argidx = ctx->argidx;
13888
13889 if (argidx < ctx->arglen) {
13890 ctx->argidx++;
13891 if (ctx->arglen < 0)
13892 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 else
Victor Stinnera47082312012-10-04 02:19:54 +020013894 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895 }
13896 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013897 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013898 return NULL;
13899}
13900
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013901/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013902
Victor Stinnera47082312012-10-04 02:19:54 +020013903/* Format a float into the writer if the writer is not NULL, or into *p_output
13904 otherwise.
13905
13906 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013907static int
Victor Stinnera47082312012-10-04 02:19:54 +020013908formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13909 PyObject **p_output,
13910 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013911{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013912 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013913 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013914 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013915 int prec;
13916 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013917
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918 x = PyFloat_AsDouble(v);
13919 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013920 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013921
Victor Stinnera47082312012-10-04 02:19:54 +020013922 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013923 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013925
Victor Stinnera47082312012-10-04 02:19:54 +020013926 if (arg->flags & F_ALT)
13927 dtoa_flags = Py_DTSF_ALT;
13928 else
13929 dtoa_flags = 0;
13930 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013931 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932 return -1;
13933 len = strlen(p);
13934 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013935 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013936 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013937 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013938 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013939 }
13940 else
13941 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013942 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013943 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944}
13945
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.  Within this file it is called
 * by mainformatlong() when its fast path does not apply.
 * Return value:  a new str object, or NULL (with an exception set) on error.
 * The output string is of the form
 *     "-"? ("0o" | "0x" | "0X")? digit+
 * The base prefix is present only for o, x and X conversions and only when
 * alt is non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base prefix (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  the o, x and X conversions can produce a '-' sign.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits is at most 3 (sign plus a two-character base prefix),
       so this keeps numnondigits + prec below from overflowing an int. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* In builds without assertions, fall through and format as
           decimal. */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless the alternate form was requested.
       The marker is skipped by advancing buf two characters; a leading
       '-' is then rewritten at the new start of the text. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet the minimum number of digits.
       The padded text is built in a temporary bytes object, which then
       replaces result: sign/prefix first, then the zeroes, then the
       digits. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If result is now the temporary bytes object, or buf no longer points
       at the start of the str data (prefix skipped), build a fresh str
       from buf; otherwise only adjust the length if it changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14091
Ethan Furmandf3ed242014-01-05 06:50:30 -080014092/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014093 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014094 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014095 * -1 and raise an exception on error */
14096static int
Victor Stinnera47082312012-10-04 02:19:54 +020014097mainformatlong(PyObject *v,
14098 struct unicode_format_arg_t *arg,
14099 PyObject **p_output,
14100 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014101{
14102 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014103 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014104
14105 if (!PyNumber_Check(v))
14106 goto wrongtype;
14107
Ethan Furman9ab74802014-03-21 06:38:46 -070014108 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014109 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014110 if (type == 'o' || type == 'x' || type == 'X') {
14111 iobj = PyNumber_Index(v);
14112 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014113 if (PyErr_ExceptionMatches(PyExc_TypeError))
14114 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014115 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014116 }
14117 }
14118 else {
14119 iobj = PyNumber_Long(v);
14120 if (iobj == NULL ) {
14121 if (PyErr_ExceptionMatches(PyExc_TypeError))
14122 goto wrongtype;
14123 return -1;
14124 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014125 }
14126 assert(PyLong_Check(iobj));
14127 }
14128 else {
14129 iobj = v;
14130 Py_INCREF(iobj);
14131 }
14132
14133 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014134 && arg->width == -1 && arg->prec == -1
14135 && !(arg->flags & (F_SIGN | F_BLANK))
14136 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014137 {
14138 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014139 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014140 int base;
14141
Victor Stinnera47082312012-10-04 02:19:54 +020014142 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014143 {
14144 default:
14145 assert(0 && "'type' not in [diuoxX]");
14146 case 'd':
14147 case 'i':
14148 case 'u':
14149 base = 10;
14150 break;
14151 case 'o':
14152 base = 8;
14153 break;
14154 case 'x':
14155 case 'X':
14156 base = 16;
14157 break;
14158 }
14159
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014160 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14161 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014162 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014163 }
14164 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014165 return 1;
14166 }
14167
Ethan Furmanb95b5612015-01-23 20:05:18 -080014168 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014169 Py_DECREF(iobj);
14170 if (res == NULL)
14171 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014172 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014173 return 0;
14174
14175wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014176 switch(type)
14177 {
14178 case 'o':
14179 case 'x':
14180 case 'X':
14181 PyErr_Format(PyExc_TypeError,
14182 "%%%c format: an integer is required, "
14183 "not %.200s",
14184 type, Py_TYPE(v)->tp_name);
14185 break;
14186 default:
14187 PyErr_Format(PyExc_TypeError,
14188 "%%%c format: a number is required, "
14189 "not %.200s",
14190 type, Py_TYPE(v)->tp_name);
14191 break;
14192 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014193 return -1;
14194}
14195
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014196static Py_UCS4
14197formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014198{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014199 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014200 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014201 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014202 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014203 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014204 goto onError;
14205 }
14206 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014207 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014208 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014209 /* make sure number is a type of integer */
14210 if (!PyLong_Check(v)) {
14211 iobj = PyNumber_Index(v);
14212 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014213 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014214 }
14215 v = iobj;
14216 Py_DECREF(iobj);
14217 }
14218 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 x = PyLong_AsLong(v);
14220 if (x == -1 && PyErr_Occurred())
14221 goto onError;
14222
Victor Stinner8faf8212011-12-08 22:14:11 +010014223 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014224 PyErr_SetString(PyExc_OverflowError,
14225 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014226 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014227 }
14228
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014229 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014231
Benjamin Peterson29060642009-01-31 22:14:21 +000014232 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014233 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014234 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014235 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014236}
14237
Victor Stinnera47082312012-10-04 02:19:54 +020014238/* Parse options of an argument: flags, width, precision.
14239 Handle also "%(name)" syntax.
14240
14241 Return 0 if the argument has been formatted into arg->str.
14242 Return 1 if the argument has been written into ctx->writer,
14243 Raise an exception and return -1 on error. */
14244static int
14245unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14246 struct unicode_format_arg_t *arg)
14247{
14248#define FORMAT_READ(ctx) \
14249 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14250
14251 PyObject *v;
14252
Victor Stinnera47082312012-10-04 02:19:54 +020014253 if (arg->ch == '(') {
14254 /* Get argument value from a dictionary. Example: "%(name)s". */
14255 Py_ssize_t keystart;
14256 Py_ssize_t keylen;
14257 PyObject *key;
14258 int pcount = 1;
14259
14260 if (ctx->dict == NULL) {
14261 PyErr_SetString(PyExc_TypeError,
14262 "format requires a mapping");
14263 return -1;
14264 }
14265 ++ctx->fmtpos;
14266 --ctx->fmtcnt;
14267 keystart = ctx->fmtpos;
14268 /* Skip over balanced parentheses */
14269 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14270 arg->ch = FORMAT_READ(ctx);
14271 if (arg->ch == ')')
14272 --pcount;
14273 else if (arg->ch == '(')
14274 ++pcount;
14275 ctx->fmtpos++;
14276 }
14277 keylen = ctx->fmtpos - keystart - 1;
14278 if (ctx->fmtcnt < 0 || pcount > 0) {
14279 PyErr_SetString(PyExc_ValueError,
14280 "incomplete format key");
14281 return -1;
14282 }
14283 key = PyUnicode_Substring(ctx->fmtstr,
14284 keystart, keystart + keylen);
14285 if (key == NULL)
14286 return -1;
14287 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014288 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014289 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014290 }
14291 ctx->args = PyObject_GetItem(ctx->dict, key);
14292 Py_DECREF(key);
14293 if (ctx->args == NULL)
14294 return -1;
14295 ctx->args_owned = 1;
14296 ctx->arglen = -1;
14297 ctx->argidx = -2;
14298 }
14299
14300 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014301 while (--ctx->fmtcnt >= 0) {
14302 arg->ch = FORMAT_READ(ctx);
14303 ctx->fmtpos++;
14304 switch (arg->ch) {
14305 case '-': arg->flags |= F_LJUST; continue;
14306 case '+': arg->flags |= F_SIGN; continue;
14307 case ' ': arg->flags |= F_BLANK; continue;
14308 case '#': arg->flags |= F_ALT; continue;
14309 case '0': arg->flags |= F_ZERO; continue;
14310 }
14311 break;
14312 }
14313
14314 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014315 if (arg->ch == '*') {
14316 v = unicode_format_getnextarg(ctx);
14317 if (v == NULL)
14318 return -1;
14319 if (!PyLong_Check(v)) {
14320 PyErr_SetString(PyExc_TypeError,
14321 "* wants int");
14322 return -1;
14323 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014324 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014325 if (arg->width == -1 && PyErr_Occurred())
14326 return -1;
14327 if (arg->width < 0) {
14328 arg->flags |= F_LJUST;
14329 arg->width = -arg->width;
14330 }
14331 if (--ctx->fmtcnt >= 0) {
14332 arg->ch = FORMAT_READ(ctx);
14333 ctx->fmtpos++;
14334 }
14335 }
14336 else if (arg->ch >= '0' && arg->ch <= '9') {
14337 arg->width = arg->ch - '0';
14338 while (--ctx->fmtcnt >= 0) {
14339 arg->ch = FORMAT_READ(ctx);
14340 ctx->fmtpos++;
14341 if (arg->ch < '0' || arg->ch > '9')
14342 break;
14343 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14344 mixing signed and unsigned comparison. Since arg->ch is between
14345 '0' and '9', casting to int is safe. */
14346 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14347 PyErr_SetString(PyExc_ValueError,
14348 "width too big");
14349 return -1;
14350 }
14351 arg->width = arg->width*10 + (arg->ch - '0');
14352 }
14353 }
14354
14355 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014356 if (arg->ch == '.') {
14357 arg->prec = 0;
14358 if (--ctx->fmtcnt >= 0) {
14359 arg->ch = FORMAT_READ(ctx);
14360 ctx->fmtpos++;
14361 }
14362 if (arg->ch == '*') {
14363 v = unicode_format_getnextarg(ctx);
14364 if (v == NULL)
14365 return -1;
14366 if (!PyLong_Check(v)) {
14367 PyErr_SetString(PyExc_TypeError,
14368 "* wants int");
14369 return -1;
14370 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014371 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014372 if (arg->prec == -1 && PyErr_Occurred())
14373 return -1;
14374 if (arg->prec < 0)
14375 arg->prec = 0;
14376 if (--ctx->fmtcnt >= 0) {
14377 arg->ch = FORMAT_READ(ctx);
14378 ctx->fmtpos++;
14379 }
14380 }
14381 else if (arg->ch >= '0' && arg->ch <= '9') {
14382 arg->prec = arg->ch - '0';
14383 while (--ctx->fmtcnt >= 0) {
14384 arg->ch = FORMAT_READ(ctx);
14385 ctx->fmtpos++;
14386 if (arg->ch < '0' || arg->ch > '9')
14387 break;
14388 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14389 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014390 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014391 return -1;
14392 }
14393 arg->prec = arg->prec*10 + (arg->ch - '0');
14394 }
14395 }
14396 }
14397
14398 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14399 if (ctx->fmtcnt >= 0) {
14400 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14401 if (--ctx->fmtcnt >= 0) {
14402 arg->ch = FORMAT_READ(ctx);
14403 ctx->fmtpos++;
14404 }
14405 }
14406 }
14407 if (ctx->fmtcnt < 0) {
14408 PyErr_SetString(PyExc_ValueError,
14409 "incomplete format");
14410 return -1;
14411 }
14412 return 0;
14413
14414#undef FORMAT_READ
14415}
14416
14417/* Format one argument. Supported conversion specifiers:
14418
14419 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014420 - "i", "d", "u": int or float
14421 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014422 - "e", "E", "f", "F", "g", "G": float
14423 - "c": int or str (1 character)
14424
Victor Stinner8dbd4212012-12-04 09:30:24 +010014425 When possible, the output is written directly into the Unicode writer
14426 (ctx->writer). A string is created when padding is required.
14427
Victor Stinnera47082312012-10-04 02:19:54 +020014428 Return 0 if the argument has been formatted into *p_str,
14429 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014430 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014431static int
14432unicode_format_arg_format(struct unicode_formatter_t *ctx,
14433 struct unicode_format_arg_t *arg,
14434 PyObject **p_str)
14435{
14436 PyObject *v;
14437 _PyUnicodeWriter *writer = &ctx->writer;
14438
14439 if (ctx->fmtcnt == 0)
14440 ctx->writer.overallocate = 0;
14441
14442 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014443 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014444 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014445 return 1;
14446 }
14447
14448 v = unicode_format_getnextarg(ctx);
14449 if (v == NULL)
14450 return -1;
14451
Victor Stinnera47082312012-10-04 02:19:54 +020014452
14453 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014454 case 's':
14455 case 'r':
14456 case 'a':
14457 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14458 /* Fast path */
14459 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14460 return -1;
14461 return 1;
14462 }
14463
14464 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14465 *p_str = v;
14466 Py_INCREF(*p_str);
14467 }
14468 else {
14469 if (arg->ch == 's')
14470 *p_str = PyObject_Str(v);
14471 else if (arg->ch == 'r')
14472 *p_str = PyObject_Repr(v);
14473 else
14474 *p_str = PyObject_ASCII(v);
14475 }
14476 break;
14477
14478 case 'i':
14479 case 'd':
14480 case 'u':
14481 case 'o':
14482 case 'x':
14483 case 'X':
14484 {
14485 int ret = mainformatlong(v, arg, p_str, writer);
14486 if (ret != 0)
14487 return ret;
14488 arg->sign = 1;
14489 break;
14490 }
14491
14492 case 'e':
14493 case 'E':
14494 case 'f':
14495 case 'F':
14496 case 'g':
14497 case 'G':
14498 if (arg->width == -1 && arg->prec == -1
14499 && !(arg->flags & (F_SIGN | F_BLANK)))
14500 {
14501 /* Fast path */
14502 if (formatfloat(v, arg, NULL, writer) == -1)
14503 return -1;
14504 return 1;
14505 }
14506
14507 arg->sign = 1;
14508 if (formatfloat(v, arg, p_str, NULL) == -1)
14509 return -1;
14510 break;
14511
14512 case 'c':
14513 {
14514 Py_UCS4 ch = formatchar(v);
14515 if (ch == (Py_UCS4) -1)
14516 return -1;
14517 if (arg->width == -1 && arg->prec == -1) {
14518 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014519 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014520 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014521 return 1;
14522 }
14523 *p_str = PyUnicode_FromOrdinal(ch);
14524 break;
14525 }
14526
14527 default:
14528 PyErr_Format(PyExc_ValueError,
14529 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014530 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014531 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14532 (int)arg->ch,
14533 ctx->fmtpos - 1);
14534 return -1;
14535 }
14536 if (*p_str == NULL)
14537 return -1;
14538 assert (PyUnicode_Check(*p_str));
14539 return 0;
14540}
14541
14542static int
14543unicode_format_arg_output(struct unicode_formatter_t *ctx,
14544 struct unicode_format_arg_t *arg,
14545 PyObject *str)
14546{
14547 Py_ssize_t len;
14548 enum PyUnicode_Kind kind;
14549 void *pbuf;
14550 Py_ssize_t pindex;
14551 Py_UCS4 signchar;
14552 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014553 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014554 Py_ssize_t sublen;
14555 _PyUnicodeWriter *writer = &ctx->writer;
14556 Py_UCS4 fill;
14557
14558 fill = ' ';
14559 if (arg->sign && arg->flags & F_ZERO)
14560 fill = '0';
14561
14562 if (PyUnicode_READY(str) == -1)
14563 return -1;
14564
14565 len = PyUnicode_GET_LENGTH(str);
14566 if ((arg->width == -1 || arg->width <= len)
14567 && (arg->prec == -1 || arg->prec >= len)
14568 && !(arg->flags & (F_SIGN | F_BLANK)))
14569 {
14570 /* Fast path */
14571 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14572 return -1;
14573 return 0;
14574 }
14575
14576 /* Truncate the string for "s", "r" and "a" formats
14577 if the precision is set */
14578 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14579 if (arg->prec >= 0 && len > arg->prec)
14580 len = arg->prec;
14581 }
14582
14583 /* Adjust sign and width */
14584 kind = PyUnicode_KIND(str);
14585 pbuf = PyUnicode_DATA(str);
14586 pindex = 0;
14587 signchar = '\0';
14588 if (arg->sign) {
14589 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14590 if (ch == '-' || ch == '+') {
14591 signchar = ch;
14592 len--;
14593 pindex++;
14594 }
14595 else if (arg->flags & F_SIGN)
14596 signchar = '+';
14597 else if (arg->flags & F_BLANK)
14598 signchar = ' ';
14599 else
14600 arg->sign = 0;
14601 }
14602 if (arg->width < len)
14603 arg->width = len;
14604
14605 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014606 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014607 if (!(arg->flags & F_LJUST)) {
14608 if (arg->sign) {
14609 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014610 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014611 }
14612 else {
14613 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014614 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014615 }
14616 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014617 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14618 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014619 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014620 }
14621
Victor Stinnera47082312012-10-04 02:19:54 +020014622 buflen = arg->width;
14623 if (arg->sign && len == arg->width)
14624 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014625 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014626 return -1;
14627
14628 /* Write the sign if needed */
14629 if (arg->sign) {
14630 if (fill != ' ') {
14631 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14632 writer->pos += 1;
14633 }
14634 if (arg->width > len)
14635 arg->width--;
14636 }
14637
14638 /* Write the numeric prefix for "x", "X" and "o" formats
14639 if the alternate form is used.
14640 For example, write "0x" for the "%#x" format. */
14641 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14642 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14643 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14644 if (fill != ' ') {
14645 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14646 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14647 writer->pos += 2;
14648 pindex += 2;
14649 }
14650 arg->width -= 2;
14651 if (arg->width < 0)
14652 arg->width = 0;
14653 len -= 2;
14654 }
14655
14656 /* Pad left with the fill character if needed */
14657 if (arg->width > len && !(arg->flags & F_LJUST)) {
14658 sublen = arg->width - len;
14659 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14660 writer->pos += sublen;
14661 arg->width = len;
14662 }
14663
14664 /* If padding with spaces: write sign if needed and/or numeric prefix if
14665 the alternate form is used */
14666 if (fill == ' ') {
14667 if (arg->sign) {
14668 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14669 writer->pos += 1;
14670 }
14671 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14672 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14673 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14674 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14675 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14676 writer->pos += 2;
14677 pindex += 2;
14678 }
14679 }
14680
14681 /* Write characters */
14682 if (len) {
14683 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14684 str, pindex, len);
14685 writer->pos += len;
14686 }
14687
14688 /* Pad right with the fill character if needed */
14689 if (arg->width > len) {
14690 sublen = arg->width - len;
14691 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14692 writer->pos += sublen;
14693 }
14694 return 0;
14695}
14696
14697/* Helper of PyUnicode_Format(): format one arg.
14698 Return 0 on success, raise an exception and return -1 on error. */
14699static int
14700unicode_format_arg(struct unicode_formatter_t *ctx)
14701{
14702 struct unicode_format_arg_t arg;
14703 PyObject *str;
14704 int ret;
14705
Victor Stinner8dbd4212012-12-04 09:30:24 +010014706 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14707 arg.flags = 0;
14708 arg.width = -1;
14709 arg.prec = -1;
14710 arg.sign = 0;
14711 str = NULL;
14712
Victor Stinnera47082312012-10-04 02:19:54 +020014713 ret = unicode_format_arg_parse(ctx, &arg);
14714 if (ret == -1)
14715 return -1;
14716
14717 ret = unicode_format_arg_format(ctx, &arg, &str);
14718 if (ret == -1)
14719 return -1;
14720
14721 if (ret != 1) {
14722 ret = unicode_format_arg_output(ctx, &arg, str);
14723 Py_DECREF(str);
14724 if (ret == -1)
14725 return -1;
14726 }
14727
14728 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14729 PyErr_SetString(PyExc_TypeError,
14730 "not all arguments converted during string formatting");
14731 return -1;
14732 }
14733 return 0;
14734}
14735
Alexander Belopolsky40018472011-02-26 01:02:56 +000014736PyObject *
14737PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014738{
Victor Stinnera47082312012-10-04 02:19:54 +020014739 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014740
Guido van Rossumd57fd912000-03-10 22:53:23 +000014741 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 PyErr_BadInternalCall();
14743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014744 }
Victor Stinnera47082312012-10-04 02:19:54 +020014745
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014746 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014747 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014748
14749 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014750 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14751 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14752 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14753 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014754
Victor Stinner8f674cc2013-04-17 23:02:17 +020014755 _PyUnicodeWriter_Init(&ctx.writer);
14756 ctx.writer.min_length = ctx.fmtcnt + 100;
14757 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014758
Guido van Rossumd57fd912000-03-10 22:53:23 +000014759 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014760 ctx.arglen = PyTuple_Size(args);
14761 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014762 }
14763 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014764 ctx.arglen = -1;
14765 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014766 }
Victor Stinnera47082312012-10-04 02:19:54 +020014767 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014768 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014769 ctx.dict = args;
14770 else
14771 ctx.dict = NULL;
14772 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014773
Victor Stinnera47082312012-10-04 02:19:54 +020014774 while (--ctx.fmtcnt >= 0) {
14775 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014776 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014777
14778 nonfmtpos = ctx.fmtpos++;
14779 while (ctx.fmtcnt >= 0 &&
14780 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14781 ctx.fmtpos++;
14782 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014783 }
Victor Stinnera47082312012-10-04 02:19:54 +020014784 if (ctx.fmtcnt < 0) {
14785 ctx.fmtpos--;
14786 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014787 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014788
Victor Stinnercfc4c132013-04-03 01:48:39 +020014789 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14790 nonfmtpos, ctx.fmtpos) < 0)
14791 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014792 }
14793 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014794 ctx.fmtpos++;
14795 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014796 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014797 }
14798 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014799
Victor Stinnera47082312012-10-04 02:19:54 +020014800 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014801 PyErr_SetString(PyExc_TypeError,
14802 "not all arguments converted during string formatting");
14803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014804 }
14805
Victor Stinnera47082312012-10-04 02:19:54 +020014806 if (ctx.args_owned) {
14807 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014808 }
Victor Stinnera47082312012-10-04 02:19:54 +020014809 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014810
Benjamin Peterson29060642009-01-31 22:14:21 +000014811 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014812 _PyUnicodeWriter_Dealloc(&ctx.writer);
14813 if (ctx.args_owned) {
14814 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014815 }
14816 return NULL;
14817}
14818
Jeremy Hylton938ace62002-07-17 16:30:39 +000014819static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014820unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14821
Tim Peters6d6c1a32001-08-02 04:15:00 +000014822static PyObject *
14823unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14824{
Benjamin Peterson29060642009-01-31 22:14:21 +000014825 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014826 static char *kwlist[] = {"object", "encoding", "errors", 0};
14827 char *encoding = NULL;
14828 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014829
Benjamin Peterson14339b62009-01-31 16:36:08 +000014830 if (type != &PyUnicode_Type)
14831 return unicode_subtype_new(type, args, kwds);
14832 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014833 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014834 return NULL;
14835 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014836 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014837 if (encoding == NULL && errors == NULL)
14838 return PyObject_Str(x);
14839 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014840 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014841}
14842
Guido van Rossume023fe02001-08-30 03:12:59 +000014843static PyObject *
14844unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14845{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014846 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014847 Py_ssize_t length, char_size;
14848 int share_wstr, share_utf8;
14849 unsigned int kind;
14850 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014851
Benjamin Peterson14339b62009-01-31 16:36:08 +000014852 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014853
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014854 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014855 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014856 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014857 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014858 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014859 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014860 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014861 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014862
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014863 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014864 if (self == NULL) {
14865 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014866 return NULL;
14867 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014868 kind = PyUnicode_KIND(unicode);
14869 length = PyUnicode_GET_LENGTH(unicode);
14870
14871 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014872#ifdef Py_DEBUG
14873 _PyUnicode_HASH(self) = -1;
14874#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014875 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014876#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014877 _PyUnicode_STATE(self).interned = 0;
14878 _PyUnicode_STATE(self).kind = kind;
14879 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014880 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014881 _PyUnicode_STATE(self).ready = 1;
14882 _PyUnicode_WSTR(self) = NULL;
14883 _PyUnicode_UTF8_LENGTH(self) = 0;
14884 _PyUnicode_UTF8(self) = NULL;
14885 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014886 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014887
14888 share_utf8 = 0;
14889 share_wstr = 0;
14890 if (kind == PyUnicode_1BYTE_KIND) {
14891 char_size = 1;
14892 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14893 share_utf8 = 1;
14894 }
14895 else if (kind == PyUnicode_2BYTE_KIND) {
14896 char_size = 2;
14897 if (sizeof(wchar_t) == 2)
14898 share_wstr = 1;
14899 }
14900 else {
14901 assert(kind == PyUnicode_4BYTE_KIND);
14902 char_size = 4;
14903 if (sizeof(wchar_t) == 4)
14904 share_wstr = 1;
14905 }
14906
14907 /* Ensure we won't overflow the length. */
14908 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14909 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014910 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014911 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014912 data = PyObject_MALLOC((length + 1) * char_size);
14913 if (data == NULL) {
14914 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014915 goto onError;
14916 }
14917
Victor Stinnerc3c74152011-10-02 20:39:55 +020014918 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014919 if (share_utf8) {
14920 _PyUnicode_UTF8_LENGTH(self) = length;
14921 _PyUnicode_UTF8(self) = data;
14922 }
14923 if (share_wstr) {
14924 _PyUnicode_WSTR_LENGTH(self) = length;
14925 _PyUnicode_WSTR(self) = (wchar_t *)data;
14926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014927
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014928 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014929 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014930 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014931#ifdef Py_DEBUG
14932 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14933#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014934 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014935 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014936
14937onError:
14938 Py_DECREF(unicode);
14939 Py_DECREF(self);
14940 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014941}
14942
/* Docstring of the str type; used as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014954
/* Forward declaration: referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str. The initializer is positional: each entry is
   labelled with the PyTypeObject slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15000
15001/* Initialize the Unicode implementation */
15002
Victor Stinner3a50e702011-10-18 21:21:00 +020015003int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015005 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015006 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015007 0x000A, /* LINE FEED */
15008 0x000D, /* CARRIAGE RETURN */
15009 0x001C, /* FILE SEPARATOR */
15010 0x001D, /* GROUP SEPARATOR */
15011 0x001E, /* RECORD SEPARATOR */
15012 0x0085, /* NEXT LINE */
15013 0x2028, /* LINE SEPARATOR */
15014 0x2029, /* PARAGRAPH SEPARATOR */
15015 };
15016
Fred Drakee4315f52000-05-09 19:53:39 +000015017 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015018 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015019 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015020 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015021 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015022
Guido van Rossumcacfc072002-05-24 19:01:59 +000015023 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015024 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015025
15026 /* initialize the linebreak bloom filter */
15027 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015028 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015029 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015030
Christian Heimes26532f72013-07-20 14:57:16 +020015031 if (PyType_Ready(&EncodingMapType) < 0)
15032 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015033
Benjamin Petersonc4311282012-10-30 23:21:10 -040015034 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15035 Py_FatalError("Can't initialize field name iterator type");
15036
15037 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15038 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015039
Victor Stinner3a50e702011-10-18 21:21:00 +020015040 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015041}
15042
15043/* Finalize the Unicode implementation */
15044
/* Does nothing here and always returns 0; still called from
   _PyUnicode_Fini(). */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15050
Guido van Rossumd57fd912000-03-10 22:53:23 +000015051void
Thomas Wouters78890102000-07-22 19:25:51 +000015052_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015053{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015054 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015055
Serhiy Storchaka05997252013-01-26 12:14:02 +020015056 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015057
Serhiy Storchaka05997252013-01-26 12:14:02 +020015058 for (i = 0; i < 256; i++)
15059 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015060 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015061 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015062}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015063
Walter Dörwald16807132007-05-25 13:52:07 +000015064void
15065PyUnicode_InternInPlace(PyObject **p)
15066{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015067 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015069#ifdef Py_DEBUG
15070 assert(s != NULL);
15071 assert(_PyUnicode_CHECK(s));
15072#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015073 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015074 return;
15075#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 /* If it's a subclass, we don't really know what putting
15077 it in the interned dict might do. */
15078 if (!PyUnicode_CheckExact(s))
15079 return;
15080 if (PyUnicode_CHECK_INTERNED(s))
15081 return;
15082 if (interned == NULL) {
15083 interned = PyDict_New();
15084 if (interned == NULL) {
15085 PyErr_Clear(); /* Don't leave an exception */
15086 return;
15087 }
15088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015090 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015092 if (t == NULL) {
15093 PyErr_Clear();
15094 return;
15095 }
15096 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015097 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015098 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015099 return;
15100 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 /* The two references in interned are not counted by refcnt.
15102 The deallocator will take care of this */
15103 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015104 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015105}
15106
15107void
15108PyUnicode_InternImmortal(PyObject **p)
15109{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015110 PyUnicode_InternInPlace(p);
15111 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015112 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 Py_INCREF(*p);
15114 }
Walter Dörwald16807132007-05-25 13:52:07 +000015115}
15116
15117PyObject *
15118PyUnicode_InternFromString(const char *cp)
15119{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015120 PyObject *s = PyUnicode_FromString(cp);
15121 if (s == NULL)
15122 return NULL;
15123 PyUnicode_InternInPlace(&s);
15124 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015125}
15126
Alexander Belopolsky40018472011-02-26 01:02:56 +000015127void
15128_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015129{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015130 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015131 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015132 Py_ssize_t i, n;
15133 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015134
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 if (interned == NULL || !PyDict_Check(interned))
15136 return;
15137 keys = PyDict_Keys(interned);
15138 if (keys == NULL || !PyList_Check(keys)) {
15139 PyErr_Clear();
15140 return;
15141 }
Walter Dörwald16807132007-05-25 13:52:07 +000015142
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15144 detector, interned unicode strings are not forcibly deallocated;
15145 rather, we give them their stolen references back, and then clear
15146 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015147
Benjamin Peterson14339b62009-01-31 16:36:08 +000015148 n = PyList_GET_SIZE(keys);
15149 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015150 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015152 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015153 if (PyUnicode_READY(s) == -1) {
15154 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015155 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015157 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015158 case SSTATE_NOT_INTERNED:
15159 /* XXX Shouldn't happen */
15160 break;
15161 case SSTATE_INTERNED_IMMORTAL:
15162 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015163 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015164 break;
15165 case SSTATE_INTERNED_MORTAL:
15166 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015167 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 break;
15169 default:
15170 Py_FatalError("Inconsistent interned string state.");
15171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015172 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 }
15174 fprintf(stderr, "total size of all interned strings: "
15175 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15176 "mortal/immortal\n", mortal_size, immortal_size);
15177 Py_DECREF(keys);
15178 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015179 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015180}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015181
15182
15183/********************* Unicode Iterator **************************/
15184
15185typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 PyObject_HEAD
15187 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015188 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015189} unicodeiterobject;
15190
15191static void
15192unicodeiter_dealloc(unicodeiterobject *it)
15193{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015194 _PyObject_GC_UNTRACK(it);
15195 Py_XDECREF(it->it_seq);
15196 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015197}
15198
15199static int
15200unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15201{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 Py_VISIT(it->it_seq);
15203 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015204}
15205
15206static PyObject *
15207unicodeiter_next(unicodeiterobject *it)
15208{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015209 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015210
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 assert(it != NULL);
15212 seq = it->it_seq;
15213 if (seq == NULL)
15214 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015215 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015217 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15218 int kind = PyUnicode_KIND(seq);
15219 void *data = PyUnicode_DATA(seq);
15220 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15221 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015222 if (item != NULL)
15223 ++it->it_index;
15224 return item;
15225 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015226
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015228 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015230}
15231
15232static PyObject *
15233unicodeiter_len(unicodeiterobject *it)
15234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 Py_ssize_t len = 0;
15236 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015237 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015238 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015239}
15240
15241PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15242
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015243static PyObject *
15244unicodeiter_reduce(unicodeiterobject *it)
15245{
15246 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015247 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015248 it->it_seq, it->it_index);
15249 } else {
15250 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15251 if (u == NULL)
15252 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015253 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015254 }
15255}
15256
15257PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15258
15259static PyObject *
15260unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15261{
15262 Py_ssize_t index = PyLong_AsSsize_t(state);
15263 if (index == -1 && PyErr_Occurred())
15264 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015265 if (it->it_seq != NULL) {
15266 if (index < 0)
15267 index = 0;
15268 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15269 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15270 it->it_index = index;
15271 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015272 Py_RETURN_NONE;
15273}
15274
15275PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15276
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015277static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015279 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015280 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15281 reduce_doc},
15282 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15283 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015285};
15286
15287PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15289 "str_iterator", /* tp_name */
15290 sizeof(unicodeiterobject), /* tp_basicsize */
15291 0, /* tp_itemsize */
15292 /* methods */
15293 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15294 0, /* tp_print */
15295 0, /* tp_getattr */
15296 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015297 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 0, /* tp_repr */
15299 0, /* tp_as_number */
15300 0, /* tp_as_sequence */
15301 0, /* tp_as_mapping */
15302 0, /* tp_hash */
15303 0, /* tp_call */
15304 0, /* tp_str */
15305 PyObject_GenericGetAttr, /* tp_getattro */
15306 0, /* tp_setattro */
15307 0, /* tp_as_buffer */
15308 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15309 0, /* tp_doc */
15310 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15311 0, /* tp_clear */
15312 0, /* tp_richcompare */
15313 0, /* tp_weaklistoffset */
15314 PyObject_SelfIter, /* tp_iter */
15315 (iternextfunc)unicodeiter_next, /* tp_iternext */
15316 unicodeiter_methods, /* tp_methods */
15317 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015318};
15319
15320static PyObject *
15321unicode_iter(PyObject *seq)
15322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015324
Benjamin Peterson14339b62009-01-31 16:36:08 +000015325 if (!PyUnicode_Check(seq)) {
15326 PyErr_BadInternalCall();
15327 return NULL;
15328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015329 if (PyUnicode_READY(seq) == -1)
15330 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15332 if (it == NULL)
15333 return NULL;
15334 it->it_index = 0;
15335 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015336 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 _PyObject_GC_TRACK(it);
15338 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015339}
15340
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015341
15342size_t
15343Py_UNICODE_strlen(const Py_UNICODE *u)
15344{
15345 int res = 0;
15346 while(*u++)
15347 res++;
15348 return res;
15349}
15350
15351Py_UNICODE*
15352Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15353{
15354 Py_UNICODE *u = s1;
15355 while ((*u++ = *s2++));
15356 return s1;
15357}
15358
15359Py_UNICODE*
15360Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15361{
15362 Py_UNICODE *u = s1;
15363 while ((*u++ = *s2++))
15364 if (n-- == 0)
15365 break;
15366 return s1;
15367}
15368
15369Py_UNICODE*
15370Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15371{
15372 Py_UNICODE *u1 = s1;
15373 u1 += Py_UNICODE_strlen(u1);
15374 Py_UNICODE_strcpy(u1, s2);
15375 return s1;
15376}
15377
15378int
15379Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15380{
15381 while (*s1 && *s2 && *s1 == *s2)
15382 s1++, s2++;
15383 if (*s1 && *s2)
15384 return (*s1 < *s2) ? -1 : +1;
15385 if (*s1)
15386 return 1;
15387 if (*s2)
15388 return -1;
15389 return 0;
15390}
15391
15392int
15393Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15394{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015395 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015396 for (; n != 0; n--) {
15397 u1 = *s1;
15398 u2 = *s2;
15399 if (u1 != u2)
15400 return (u1 < u2) ? -1 : +1;
15401 if (u1 == '\0')
15402 return 0;
15403 s1++;
15404 s2++;
15405 }
15406 return 0;
15407}
15408
15409Py_UNICODE*
15410Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15411{
15412 const Py_UNICODE *p;
15413 for (p = s; *p; p++)
15414 if (*p == c)
15415 return (Py_UNICODE*)p;
15416 return NULL;
15417}
15418
15419Py_UNICODE*
15420Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15421{
15422 const Py_UNICODE *p;
15423 p = s + Py_UNICODE_strlen(s);
15424 while (p != s) {
15425 p--;
15426 if (*p == c)
15427 return (Py_UNICODE*)p;
15428 }
15429 return NULL;
15430}
Victor Stinner331ea922010-08-10 16:37:20 +000015431
Victor Stinner71133ff2010-09-01 23:43:53 +000015432Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015433PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015434{
Victor Stinner577db2c2011-10-11 22:12:48 +020015435 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015436 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015438 if (!PyUnicode_Check(unicode)) {
15439 PyErr_BadArgument();
15440 return NULL;
15441 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015442 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015443 if (u == NULL)
15444 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015445 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015446 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015447 PyErr_NoMemory();
15448 return NULL;
15449 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015450 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015451 size *= sizeof(Py_UNICODE);
15452 copy = PyMem_Malloc(size);
15453 if (copy == NULL) {
15454 PyErr_NoMemory();
15455 return NULL;
15456 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015457 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015458 return copy;
15459}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015460
Georg Brandl66c221e2010-10-14 07:04:07 +000015461/* A _string module, to export formatter_parser and formatter_field_name_split
15462 to the string.Formatter class implemented in Python. */
15463
15464static PyMethodDef _string_methods[] = {
15465 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15466 METH_O, PyDoc_STR("split the argument as a field name")},
15467 {"formatter_parser", (PyCFunction) formatter_parser,
15468 METH_O, PyDoc_STR("parse the argument as a format string")},
15469 {NULL, NULL}
15470};
15471
15472static struct PyModuleDef _string_module = {
15473 PyModuleDef_HEAD_INIT,
15474 "_string",
15475 PyDoc_STR("string helper module"),
15476 0,
15477 _string_methods,
15478 NULL,
15479 NULL,
15480 NULL,
15481 NULL
15482};
15483
15484PyMODINIT_FUNC
15485PyInit__string(void)
15486{
15487 return PyModule_Create(&_string_module);
15488}
15489
15490
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015491#ifdef __cplusplus
15492}
15493#endif