blob: 0f2740630640bdd10f41ea0aa45137ee4bfbf2c7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest code point defined by Unicode: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the accessor macros in this file.  Release builds only
   test the type; Py_DEBUG builds run _PyUnicode_CheckConsistency() with
   check_content == 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Raw field accessors for the string structs.  The underscore-prefixed
   variants only cast and dereference (so they stay usable as lvalues); the
   variants that start with assert() validate the object first. */

/* Cached UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory directly after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
        ? ((char*)((PyASCIIObject*)(op) + 1)) \
        : _PyUnicode_UTF8(op)))

/* Length field that goes with the utf8 pointer. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length for compact ASCII
   strings, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
        ? ((PyASCIIObject*)(op))->length \
        : _PyUnicode_UTF8_LENGTH(op)))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local override of PyUnicode_READY: validates the object with
   _PyUnicode_CHECK, then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer is the very same buffer as the string data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the very same buffer as the string data.
   The comparison uses the macro parameter `op`; it previously referenced a
   variable named `unicode` from the caller's scope, which only worked when
   the caller happened to use that exact variable name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, the utf8 pointer is set, and that pointer is
   not the string data itself (i.e. not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
134
/* True if the Unicode object owns a separately allocated wstr block:
   the wstr pointer is set and either the string is not ready yet or the
   pointer is not the string data itself (i.e. not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Generic helper macro that copies a run of characters while converting
   each one from from_type to to_type.

   from_type, to_type: valid type names.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is exclusive).
   to:                 pointer of type "to_type *" to the output buffer.

   The bulk of the range is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (from_type *)(begin);               \
        const from_type *_end = (from_type *)(end);                  \
        const Py_ssize_t _n = _end - _iter;                          \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                      \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {        \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; ++_iter, ++_to) {                       \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-tuned overallocation factor.  The values pair 2 with "50%" and
   4 with "25%", i.e. the factor reads as the divisor of the extra space. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)

/* Return the shared empty string from the enclosing function (the returned
   value is NULL when the singleton could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    /* 0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
    /* 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line breaks: a 128-entry table indexed by ASCII code,
   1 for line-break characters and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    /* 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers known to this file.  get_error_handler() never
   returns _Py_ERROR_UNKNOWN; names it does not recognise map to
   _Py_ERROR_OTHER. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate an error handler name into its _Py_error_handler constant.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other built-in names, and _Py_ERROR_OTHER for anything else.  The
   comparison is exact (case-sensitive). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string has a real kind), also scan
                  the characters to verify that the narrowest possible kind
                  is used and that the buffer ends with a zero character.

   Every violation trips an assert(); the function itself always returns 1
   so that a call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII string: always one byte per character and ready. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: characters start right after the
               PyCompactUnicodeObject header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Not compact: characters are reached through data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Only the wstr representation exists; the string has not
                   been made ready. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as its own UTF-8 form. */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr may alias the data only for the kind whose character
               width equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* A missing cached representation must have a zero length. */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character one past the end must be zero. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Alexander Belopolsky40018472011-02-26 01:02:56 +0000723Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200829Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200830 Py_ssize_t size, Py_UCS4 ch,
831 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS1) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
839 else
840 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if ((Py_UCS2) ch != ch)
843 return -1;
844 if (direction > 0)
845 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
846 else
847 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200848 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200849 if (direction > 0)
850 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
851 else
852 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200853 default:
854 assert(0);
855 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857}
858
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte being set to 0xff; nothing is done if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        /* the kind is also the size in bytes of one character */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
877
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878static PyObject*
879resize_compact(PyObject *unicode, Py_ssize_t length)
880{
881 Py_ssize_t char_size;
882 Py_ssize_t struct_size;
883 Py_ssize_t new_size;
884 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100885 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200886#ifdef Py_DEBUG
887 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
888#endif
889
Victor Stinner79891572012-05-03 13:43:07 +0200890 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200891 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100892 assert(PyUnicode_IS_COMPACT(unicode));
893
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200894 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100895 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896 struct_size = sizeof(PyASCIIObject);
897 else
898 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200899 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
902 PyErr_NoMemory();
903 return NULL;
904 }
905 new_size = (struct_size + (length + 1) * char_size);
906
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200907 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_UTF8(unicode));
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911 }
Victor Stinner84def372011-12-11 20:04:56 +0100912 _Py_DEC_REFTOTAL;
913 _Py_ForgetReference(unicode);
914
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300915 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100916 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 PyErr_NoMemory();
919 return NULL;
920 }
Victor Stinner84def372011-12-11 20:04:56 +0100921 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200922 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100923
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200925 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100927 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200928 _PyUnicode_WSTR_LENGTH(unicode) = length;
929 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100930 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
931 PyObject_DEL(_PyUnicode_WSTR(unicode));
932 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100933 if (!PyUnicode_IS_ASCII(unicode))
934 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100935 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200936#ifdef Py_DEBUG
937 unicode_fill_invalid(unicode, old_length);
938#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
940 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200941 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 return unicode;
943}
944
Alexander Belopolsky40018472011-02-26 01:02:56 +0000945static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200946resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947{
Victor Stinner95663112011-10-04 01:03:50 +0200948 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100949 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200951 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000952
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 if (PyUnicode_IS_READY(unicode)) {
954 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200955 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200957#ifdef Py_DEBUG
958 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
959#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960
961 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200962 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200963 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
964 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965
966 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
967 PyErr_NoMemory();
968 return -1;
969 }
970 new_size = (length + 1) * char_size;
971
Victor Stinner7a9105a2011-12-12 00:13:42 +0100972 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
973 {
974 PyObject_DEL(_PyUnicode_UTF8(unicode));
975 _PyUnicode_UTF8(unicode) = NULL;
976 _PyUnicode_UTF8_LENGTH(unicode) = 0;
977 }
978
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 data = (PyObject *)PyObject_REALLOC(data, new_size);
980 if (data == NULL) {
981 PyErr_NoMemory();
982 return -1;
983 }
984 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200985 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200987 _PyUnicode_WSTR_LENGTH(unicode) = length;
988 }
989 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200990 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200991 _PyUnicode_UTF8_LENGTH(unicode) = length;
992 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 _PyUnicode_LENGTH(unicode) = length;
994 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200995#ifdef Py_DEBUG
996 unicode_fill_invalid(unicode, old_length);
997#endif
Victor Stinner95663112011-10-04 01:03:50 +0200998 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200999 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinner95663112011-10-04 01:03:50 +02001003 assert(_PyUnicode_WSTR(unicode) != NULL);
1004
1005 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001006 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001007 PyErr_NoMemory();
1008 return -1;
1009 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001010 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001011 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001013 if (!wstr) {
1014 PyErr_NoMemory();
1015 return -1;
1016 }
1017 _PyUnicode_WSTR(unicode) = wstr;
1018 _PyUnicode_WSTR(unicode)[length] = 0;
1019 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 return 0;
1022}
1023
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024static PyObject*
1025resize_copy(PyObject *unicode, Py_ssize_t length)
1026{
1027 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030
Benjamin Petersonbac79492012-01-14 13:34:47 -05001031 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001050 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
1170
/* Function form of the _PyUnicode_COMPACT_DATA() macro */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
1174void *_PyUnicode_data(void *unicode){
1175 printf("obj %p\n", unicode);
1176 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1177 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1178 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1179 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1180 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1181 return PyUnicode_DATA(unicode);
1182}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001183
1184void
1185_PyUnicode_Dump(PyObject *op)
1186{
1187 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001188 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1189 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1190 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001191
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001193 {
1194 if (ascii->state.ascii)
1195 data = (ascii + 1);
1196 else
1197 data = (compact + 1);
1198 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001199 else
1200 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001201 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1202 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001203
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 if (ascii->wstr == data)
1205 printf("shared ");
1206 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera3b334d2011-10-03 13:53:37 +02001208 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001209 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1211 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001212 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1213 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001216}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: the
   wchar_t items of [begin, end) are written to the 4-byte data of unicode.
   A high surrogate directly followed by a low surrogate is joined into one
   code point; any other item, lone surrogates included, is copied as is.
   The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character, and that its length is
   exactly the number of code points produced (checked by assertions). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *in = begin;
    Py_UCS4 *out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    out = PyUnicode_4BYTE_DATA(unicode);

    while (in < end) {
        assert(out < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* in[1] is only read once it is known to be inside the input */
        if (Py_UNICODE_IS_HIGH_SURROGATE(in[0])
            && (in + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(in[1]))
        {
            *out++ = Py_UNICODE_JOIN_SURROGATES(in[0], in[1]);
            in += 2;
            continue;
        }
        *out++ = *in++;
    }
    assert(out == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001437 Py_MEMCPY((char*)to_data + to_kind * to_start,
1438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
1506 assert(0);
1507 return -1;
1508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Victor Stinnerd3f08822012-05-29 12:57:52 +02001551 if (from_start < 0) {
1552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
1555 if (to_start < 0) {
1556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1560 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1561 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001562 "Cannot write %zi characters at %zi "
1563 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 how_many, to_start, PyUnicode_GET_LENGTH(to));
1565 return -1;
1566 }
1567
1568 if (how_many == 0)
1569 return 0;
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572 return -1;
1573
1574 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1575 if (err) {
1576 PyErr_Format(PyExc_SystemError,
1577 "Cannot copy %s characters "
1578 "into a string of %s characters",
1579 unicode_kind_name(from),
1580 unicode_kind_name(to));
1581 return -1;
1582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584}
1585
Victor Stinner17222162011-09-28 22:15:37 +02001586/* Find the maximum code point and count the number of surrogate pairs so a
1587 correct string length can be computed before converting a string to UCS4.
1588 This function counts single surrogates as a character and not as a pair.
1589
1590 Return 0 on success, or -1 on error. */
1591static int
1592find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1593 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594{
1595 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001596 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerc53be962011-10-02 21:33:54 +02001598 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 *num_surrogates = 0;
1600 *maxchar = 0;
1601
1602 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001604 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1605 && (iter+1) < end
1606 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1607 {
1608 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1609 ++(*num_surrogates);
1610 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 }
1612 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001614 {
1615 ch = *iter;
1616 iter++;
1617 }
1618 if (ch > *maxchar) {
1619 *maxchar = ch;
1620 if (*maxchar > MAX_UNICODE) {
1621 PyErr_Format(PyExc_ValueError,
1622 "character U+%x is not in range [U+0000; U+10ffff]",
1623 ch);
1624 return -1;
1625 }
1626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 }
1628 return 0;
1629}
1630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001631int
1632_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633{
1634 wchar_t *end;
1635 Py_UCS4 maxchar = 0;
1636 Py_ssize_t num_surrogates;
1637#if SIZEOF_WCHAR_T == 2
1638 Py_ssize_t length_wo_surrogates;
1639#endif
1640
Georg Brandl7597add2011-10-05 16:36:47 +02001641 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001642 strings were created using _PyObject_New() and where no canonical
1643 representation (the str field) has been set yet aka strings
1644 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001645 assert(_PyUnicode_CHECK(unicode));
1646 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001648 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001649 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001650 /* Actually, it should neither be interned nor be anything else: */
1651 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001654 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
1658 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001659 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1660 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 PyErr_NoMemory();
1662 return -1;
1663 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001664 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 _PyUnicode_WSTR(unicode), end,
1666 PyUnicode_1BYTE_DATA(unicode));
1667 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1668 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1669 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1670 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001671 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001672 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001673 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 }
1675 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8(unicode) = NULL;
1678 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 PyObject_FREE(_PyUnicode_WSTR(unicode));
1681 _PyUnicode_WSTR(unicode) = NULL;
1682 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1683 }
1684 /* In this case we might have to convert down from 4-byte native
1685 wchar_t to 2-byte unicode. */
1686 else if (maxchar < 65536) {
1687 assert(num_surrogates == 0 &&
1688 "FindMaxCharAndNumSurrogatePairs() messed up");
1689
Victor Stinner506f5922011-09-28 22:34:18 +02001690#if SIZEOF_WCHAR_T == 2
1691 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001693 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1694 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1695 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001698#else
1699 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001700 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001701 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001702 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001703 PyErr_NoMemory();
1704 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 }
Victor Stinner506f5922011-09-28 22:34:18 +02001706 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1707 _PyUnicode_WSTR(unicode), end,
1708 PyUnicode_2BYTE_DATA(unicode));
1709 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1710 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1711 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001712 _PyUnicode_UTF8(unicode) = NULL;
1713 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyObject_FREE(_PyUnicode_WSTR(unicode));
1715 _PyUnicode_WSTR(unicode) = NULL;
1716 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1717#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1720 else {
1721#if SIZEOF_WCHAR_T == 2
1722 /* in case the native representation is 2-bytes, we need to allocate a
1723 new normalized 4-byte version. */
1724 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001725 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1726 PyErr_NoMemory();
1727 return -1;
1728 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001729 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1730 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 PyErr_NoMemory();
1732 return -1;
1733 }
1734 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1735 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001736 _PyUnicode_UTF8(unicode) = NULL;
1737 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001738 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1739 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001740 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject_FREE(_PyUnicode_WSTR(unicode));
1742 _PyUnicode_WSTR(unicode) = NULL;
1743 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1744#else
1745 assert(num_surrogates == 0);
1746
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1752#endif
1753 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1754 }
1755 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001756 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return 0;
1758}
1759
Alexander Belopolsky40018472011-02-26 01:02:56 +00001760static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001761unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762{
Walter Dörwald16807132007-05-25 13:52:07 +00001763 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 case SSTATE_NOT_INTERNED:
1765 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001766
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 case SSTATE_INTERNED_MORTAL:
1768 /* revive dead object temporarily for DelItem */
1769 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001770 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 Py_FatalError(
1772 "deletion of interned string failed");
1773 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001774
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 case SSTATE_INTERNED_IMMORTAL:
1776 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 default:
1779 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001780 }
1781
Victor Stinner03490912011-10-03 23:45:12 +02001782 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001784 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001785 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001786 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1787 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001789 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790}
1791
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001792#ifdef Py_DEBUG
1793static int
1794unicode_is_singleton(PyObject *unicode)
1795{
1796 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1797 if (unicode == unicode_empty)
1798 return 1;
1799 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1800 {
1801 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1802 if (ch < 256 && unicode_latin1[ch] == unicode)
1803 return 1;
1804 }
1805 return 0;
1806}
1807#endif
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809static int
Victor Stinner488fa492011-12-12 00:01:39 +01001810unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001811{
Victor Stinner488fa492011-12-12 00:01:39 +01001812 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001813 if (Py_REFCNT(unicode) != 1)
1814 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001815 if (_PyUnicode_HASH(unicode) != -1)
1816 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (PyUnicode_CHECK_INTERNED(unicode))
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (!PyUnicode_CheckExact(unicode))
1820 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001821#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001822 /* singleton refcount is greater than 1 */
1823 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001824#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001825 return 1;
1826}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001827
Victor Stinnerfe226c02011-10-03 03:52:20 +02001828static int
1829unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1830{
1831 PyObject *unicode;
1832 Py_ssize_t old_length;
1833
1834 assert(p_unicode != NULL);
1835 unicode = *p_unicode;
1836
1837 assert(unicode != NULL);
1838 assert(PyUnicode_Check(unicode));
1839 assert(0 <= length);
1840
Victor Stinner910337b2011-10-03 03:20:16 +02001841 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001842 old_length = PyUnicode_WSTR_LENGTH(unicode);
1843 else
1844 old_length = PyUnicode_GET_LENGTH(unicode);
1845 if (old_length == length)
1846 return 0;
1847
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001848 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001849 _Py_INCREF_UNICODE_EMPTY();
1850 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001852 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 return 0;
1854 }
1855
Victor Stinner488fa492011-12-12 00:01:39 +01001856 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 PyObject *copy = resize_copy(unicode, length);
1858 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001860 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001862 }
1863
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001865 PyObject *new_unicode = resize_compact(unicode, length);
1866 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001867 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001868 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001870 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001871 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001876{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *unicode;
1878 if (p_unicode == NULL) {
1879 PyErr_BadInternalCall();
1880 return -1;
1881 }
1882 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001883 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 {
1885 PyErr_BadInternalCall();
1886 return -1;
1887 }
1888 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001889}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001890
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001891/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001892
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001893 WARNING: The function doesn't copy the terminating null character and
1894 doesn't check the maximum character (may write a latin1 character in an
1895 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001896static void
1897unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1898 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001899{
1900 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1901 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001902 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001903
1904 switch (kind) {
1905 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001906 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001907#ifdef Py_DEBUG
1908 if (PyUnicode_IS_ASCII(unicode)) {
1909 Py_UCS4 maxchar = ucs1lib_find_max_char(
1910 (const Py_UCS1*)str,
1911 (const Py_UCS1*)str + len);
1912 assert(maxchar < 128);
1913 }
1914#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001915 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001916 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 }
1918 case PyUnicode_2BYTE_KIND: {
1919 Py_UCS2 *start = (Py_UCS2 *)data + index;
1920 Py_UCS2 *ucs2 = start;
1921 assert(index <= PyUnicode_GET_LENGTH(unicode));
1922
Victor Stinner184252a2012-06-16 02:57:41 +02001923 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001924 *ucs2 = (Py_UCS2)*str;
1925
1926 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001927 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 }
1929 default: {
1930 Py_UCS4 *start = (Py_UCS4 *)data + index;
1931 Py_UCS4 *ucs4 = start;
1932 assert(kind == PyUnicode_4BYTE_KIND);
1933 assert(index <= PyUnicode_GET_LENGTH(unicode));
1934
Victor Stinner184252a2012-06-16 02:57:41 +02001935 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001936 *ucs4 = (Py_UCS4)*str;
1937
1938 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 }
1941}
1942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943static PyObject*
1944get_latin1_char(unsigned char ch)
1945{
Victor Stinnera464fc12011-10-02 20:39:30 +02001946 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001948 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 if (!unicode)
1950 return NULL;
1951 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001952 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 unicode_latin1[ch] = unicode;
1954 }
1955 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001956 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957}
1958
Victor Stinner985a82a2014-01-03 12:53:47 +01001959static PyObject*
1960unicode_char(Py_UCS4 ch)
1961{
1962 PyObject *unicode;
1963
1964 assert(ch <= MAX_UNICODE);
1965
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001966 if (ch < 256)
1967 return get_latin1_char(ch);
1968
Victor Stinner985a82a2014-01-03 12:53:47 +01001969 unicode = PyUnicode_New(1, ch);
1970 if (unicode == NULL)
1971 return NULL;
1972 switch (PyUnicode_KIND(unicode)) {
1973 case PyUnicode_1BYTE_KIND:
1974 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1975 break;
1976 case PyUnicode_2BYTE_KIND:
1977 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1978 break;
1979 default:
1980 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1981 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1982 }
1983 assert(_PyUnicode_CheckConsistency(unicode, 1));
1984 return unicode;
1985}
1986
Alexander Belopolsky40018472011-02-26 01:02:56 +00001987PyObject *
1988PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001990 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_UCS4 maxchar = 0;
1992 Py_ssize_t num_surrogates;
1993
1994 if (u == NULL)
1995 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001997 /* If the Unicode data is known at construction time, we can apply
1998 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002001 if (size == 0)
2002 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 /* Single character Unicode objects in the Latin-1 range are
2005 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002006 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return get_latin1_char((unsigned char)*u);
2008
2009 /* If not empty and not single character, copy the Unicode data
2010 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002011 if (find_maxchar_surrogates(u, u + size,
2012 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014
Victor Stinner8faf8212011-12-08 22:14:11 +01002015 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 if (!unicode)
2017 return NULL;
2018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 switch (PyUnicode_KIND(unicode)) {
2020 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2023 break;
2024 case PyUnicode_2BYTE_KIND:
2025#if Py_UNICODE_SIZE == 2
2026 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2027#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002028 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2030#endif
2031 break;
2032 case PyUnicode_4BYTE_KIND:
2033#if SIZEOF_WCHAR_T == 2
2034 /* This is the only case which has to process surrogates, thus
2035 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002036 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037#else
2038 assert(num_surrogates == 0);
2039 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2040#endif
2041 break;
2042 default:
2043 assert(0 && "Impossible state");
2044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002046 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047}
2048
Alexander Belopolsky40018472011-02-26 01:02:56 +00002049PyObject *
2050PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 if (size < 0) {
2053 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 return NULL;
2056 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002057 if (u != NULL)
2058 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2059 else
2060 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065{
2066 size_t size = strlen(u);
2067 if (size > PY_SSIZE_T_MAX) {
2068 PyErr_SetString(PyExc_OverflowError, "input too long");
2069 return NULL;
2070 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002071 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002072}
2073
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074PyObject *
2075_PyUnicode_FromId(_Py_Identifier *id)
2076{
2077 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002078 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2079 strlen(id->string),
2080 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 if (!id->object)
2082 return NULL;
2083 PyUnicode_InternInPlace(&id->object);
2084 assert(!id->next);
2085 id->next = static_strings;
2086 static_strings = id;
2087 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002088 return id->object;
2089}
2090
2091void
2092_PyUnicode_ClearStaticStrings()
2093{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 _Py_Identifier *tmp, *s = static_strings;
2095 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002096 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002097 tmp = s->next;
2098 s->next = NULL;
2099 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002100 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002101 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102}
2103
Benjamin Peterson0df54292012-03-26 14:50:32 -04002104/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002105
Victor Stinnerd3f08822012-05-29 12:57:52 +02002106PyObject*
2107_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002108{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002109 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002110 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002111 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002112#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002113 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002114#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002115 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 }
Victor Stinner785938e2011-12-11 20:09:03 +01002117 unicode = PyUnicode_New(size, 127);
2118 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002119 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002120 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2121 assert(_PyUnicode_CheckConsistency(unicode, 1));
2122 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002123}
2124
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002125static Py_UCS4
2126kind_maxchar_limit(unsigned int kind)
2127{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002128 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002129 case PyUnicode_1BYTE_KIND:
2130 return 0x80;
2131 case PyUnicode_2BYTE_KIND:
2132 return 0x100;
2133 case PyUnicode_4BYTE_KIND:
2134 return 0x10000;
2135 default:
2136 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002137 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002138 }
2139}
2140
Victor Stinnere6abb482012-05-02 01:15:40 +02002141Py_LOCAL_INLINE(Py_UCS4)
2142align_maxchar(Py_UCS4 maxchar)
2143{
2144 if (maxchar <= 127)
2145 return 127;
2146 else if (maxchar <= 255)
2147 return 255;
2148 else if (maxchar <= 65535)
2149 return 65535;
2150 else
2151 return MAX_UNICODE;
2152}
2153
Victor Stinner702c7342011-10-05 13:50:52 +02002154static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002155_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002159
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002162 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002163 if (size == 1)
2164 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002165
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002166 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002167 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 if (!res)
2169 return NULL;
2170 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002171 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002173}
2174
Victor Stinnere57b1c02011-09-28 22:20:48 +02002175static PyObject*
2176_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177{
2178 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002180
Serhiy Storchaka678db842013-01-26 12:16:36 +02002181 if (size == 0)
2182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002183 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 if (size == 1)
2185 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002187 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002188 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 if (!res)
2190 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002193 else {
2194 _PyUnicode_CONVERT_BYTES(
2195 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2196 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002197 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return res;
2199}
2200
Victor Stinnere57b1c02011-09-28 22:20:48 +02002201static PyObject*
2202_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203{
2204 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206
Serhiy Storchaka678db842013-01-26 12:16:36 +02002207 if (size == 0)
2208 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002209 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002210 if (size == 1)
2211 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002213 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002214 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 if (!res)
2216 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002217 if (max_char < 256)
2218 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2219 PyUnicode_1BYTE_DATA(res));
2220 else if (max_char < 0x10000)
2221 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2222 PyUnicode_2BYTE_DATA(res));
2223 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002225 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 return res;
2227}
2228
2229PyObject*
2230PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2231{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002232 if (size < 0) {
2233 PyErr_SetString(PyExc_ValueError, "size must be positive");
2234 return NULL;
2235 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002236 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002242 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 PyErr_SetString(PyExc_SystemError, "invalid kind");
2245 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247}
2248
Victor Stinnerece58de2012-04-23 23:36:38 +02002249Py_UCS4
2250_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2251{
2252 enum PyUnicode_Kind kind;
2253 void *startptr, *endptr;
2254
2255 assert(PyUnicode_IS_READY(unicode));
2256 assert(0 <= start);
2257 assert(end <= PyUnicode_GET_LENGTH(unicode));
2258 assert(start <= end);
2259
2260 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2261 return PyUnicode_MAX_CHAR_VALUE(unicode);
2262
2263 if (start == end)
2264 return 127;
2265
Victor Stinner94d558b2012-04-27 22:26:58 +02002266 if (PyUnicode_IS_ASCII(unicode))
2267 return 127;
2268
Victor Stinnerece58de2012-04-23 23:36:38 +02002269 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002270 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002271 endptr = (char *)startptr + end * kind;
2272 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002273 switch(kind) {
2274 case PyUnicode_1BYTE_KIND:
2275 return ucs1lib_find_max_char(startptr, endptr);
2276 case PyUnicode_2BYTE_KIND:
2277 return ucs2lib_find_max_char(startptr, endptr);
2278 case PyUnicode_4BYTE_KIND:
2279 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002280 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002281 assert(0);
2282 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002283 }
2284}
2285
Victor Stinner25a4b292011-10-06 12:31:55 +02002286/* Ensure that a string uses the most efficient storage, if it is not the
2287 case: create a new string with of the right kind. Write NULL into *p_unicode
2288 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002289static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002290unicode_adjust_maxchar(PyObject **p_unicode)
2291{
2292 PyObject *unicode, *copy;
2293 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002294 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002295 unsigned int kind;
2296
2297 assert(p_unicode != NULL);
2298 unicode = *p_unicode;
2299 assert(PyUnicode_IS_READY(unicode));
2300 if (PyUnicode_IS_ASCII(unicode))
2301 return;
2302
2303 len = PyUnicode_GET_LENGTH(unicode);
2304 kind = PyUnicode_KIND(unicode);
2305 if (kind == PyUnicode_1BYTE_KIND) {
2306 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 max_char = ucs1lib_find_max_char(u, u + len);
2308 if (max_char >= 128)
2309 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002310 }
2311 else if (kind == PyUnicode_2BYTE_KIND) {
2312 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs2lib_find_max_char(u, u + len);
2314 if (max_char >= 256)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002319 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs4lib_find_max_char(u, u + len);
2321 if (max_char >= 0x10000)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002325 if (copy != NULL)
2326 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002327 Py_DECREF(unicode);
2328 *p_unicode = copy;
2329}
2330
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002332_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002333{
Victor Stinner87af4f22011-11-21 23:03:47 +01002334 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002335 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadInternalCall();
2339 return NULL;
2340 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002341 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002343
Victor Stinner87af4f22011-11-21 23:03:47 +01002344 length = PyUnicode_GET_LENGTH(unicode);
2345 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 if (!copy)
2347 return NULL;
2348 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2351 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002352 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002353 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002354}
2355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356
Victor Stinnerbc603d12011-10-02 01:00:40 +02002357/* Widen Unicode objects to larger buffers. Don't write terminating null
2358 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359
2360void*
2361_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2362{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363 Py_ssize_t len;
2364 void *result;
2365 unsigned int skind;
2366
Benjamin Petersonbac79492012-01-14 13:34:47 -05002367 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 return NULL;
2369
2370 len = PyUnicode_GET_LENGTH(s);
2371 skind = PyUnicode_KIND(s);
2372 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002373 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return NULL;
2375 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002376 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002377 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002378 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 if (!result)
2380 return PyErr_NoMemory();
2381 assert(skind == PyUnicode_1BYTE_KIND);
2382 _PyUnicode_CONVERT_BYTES(
2383 Py_UCS1, Py_UCS2,
2384 PyUnicode_1BYTE_DATA(s),
2385 PyUnicode_1BYTE_DATA(s) + len,
2386 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002388 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002389 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 if (!result)
2391 return PyErr_NoMemory();
2392 if (skind == PyUnicode_2BYTE_KIND) {
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS2, Py_UCS4,
2395 PyUnicode_2BYTE_DATA(s),
2396 PyUnicode_2BYTE_DATA(s) + len,
2397 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002399 else {
2400 assert(skind == PyUnicode_1BYTE_KIND);
2401 _PyUnicode_CONVERT_BYTES(
2402 Py_UCS1, Py_UCS4,
2403 PyUnicode_1BYTE_DATA(s),
2404 PyUnicode_1BYTE_DATA(s) + len,
2405 result);
2406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002408 default:
2409 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 }
Victor Stinner01698042011-10-04 00:04:26 +02002411 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return NULL;
2413}
2414
2415static Py_UCS4*
2416as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2417 int copy_null)
2418{
2419 int kind;
2420 void *data;
2421 Py_ssize_t len, targetlen;
2422 if (PyUnicode_READY(string) == -1)
2423 return NULL;
2424 kind = PyUnicode_KIND(string);
2425 data = PyUnicode_DATA(string);
2426 len = PyUnicode_GET_LENGTH(string);
2427 targetlen = len;
2428 if (copy_null)
2429 targetlen++;
2430 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002431 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!target) {
2433 PyErr_NoMemory();
2434 return NULL;
2435 }
2436 }
2437 else {
2438 if (targetsize < targetlen) {
2439 PyErr_Format(PyExc_SystemError,
2440 "string is longer than the buffer");
2441 if (copy_null && 0 < targetsize)
2442 target[0] = 0;
2443 return NULL;
2444 }
2445 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002446 if (kind == PyUnicode_1BYTE_KIND) {
2447 Py_UCS1 *start = (Py_UCS1 *) data;
2448 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 else if (kind == PyUnicode_2BYTE_KIND) {
2451 Py_UCS2 *start = (Py_UCS2 *) data;
2452 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2453 }
2454 else {
2455 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (copy_null)
2459 target[len] = 0;
2460 return target;
2461}
2462
2463Py_UCS4*
2464PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002467 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 PyErr_BadInternalCall();
2469 return NULL;
2470 }
2471 return as_ucs4(string, target, targetsize, copy_null);
2472}
2473
2474Py_UCS4*
2475PyUnicode_AsUCS4Copy(PyObject *string)
2476{
2477 return as_ucs4(string, NULL, 0, 1);
2478}
2479
2480#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002481
Alexander Belopolsky40018472011-02-26 01:02:56 +00002482PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002483PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002487 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 PyErr_BadInternalCall();
2489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
2491
Martin v. Löwis790465f2008-04-05 20:41:37 +00002492 if (size == -1) {
2493 size = wcslen(w);
2494 }
2495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497}
2498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002500
Victor Stinner15a11362012-10-06 23:48:20 +02002501/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002502 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2503 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2504#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002505
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002506static int
2507unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2508 Py_ssize_t width, Py_ssize_t precision)
2509{
2510 Py_ssize_t length, fill, arglen;
2511 Py_UCS4 maxchar;
2512
2513 if (PyUnicode_READY(str) == -1)
2514 return -1;
2515
2516 length = PyUnicode_GET_LENGTH(str);
2517 if ((precision == -1 || precision >= length)
2518 && width <= length)
2519 return _PyUnicodeWriter_WriteStr(writer, str);
2520
2521 if (precision != -1)
2522 length = Py_MIN(precision, length);
2523
2524 arglen = Py_MAX(length, width);
2525 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527 else
2528 maxchar = writer->maxchar;
2529
2530 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531 return -1;
2532
2533 if (width > length) {
2534 fill = width - length;
2535 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536 return -1;
2537 writer->pos += fill;
2538 }
2539
2540 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541 str, 0, length);
2542 writer->pos += length;
2543 return 0;
2544}
2545
2546static int
2547unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2548 Py_ssize_t width, Py_ssize_t precision)
2549{
2550 /* UTF-8 */
2551 Py_ssize_t length;
2552 PyObject *unicode;
2553 int res;
2554
2555 length = strlen(str);
2556 if (precision != -1)
2557 length = Py_MIN(length, precision);
2558 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2559 if (unicode == NULL)
2560 return -1;
2561
2562 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2563 Py_DECREF(unicode);
2564 return res;
2565}
2566
Victor Stinner96865452011-03-01 23:44:09 +00002567static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002568unicode_fromformat_arg(_PyUnicodeWriter *writer,
2569 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002570{
Victor Stinnere215d962012-10-06 23:03:36 +02002571 const char *p;
2572 Py_ssize_t len;
2573 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width;
2575 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 int longflag;
2577 int longlongflag;
2578 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580
2581 p = f;
2582 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002583 zeropad = 0;
2584 if (*f == '0') {
2585 zeropad = 1;
2586 f++;
2587 }
Victor Stinner96865452011-03-01 23:44:09 +00002588
2589 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 width = -1;
2591 if (Py_ISDIGIT((unsigned)*f)) {
2592 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002593 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002596 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002598 return NULL;
2599 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002601 f++;
2602 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002603 }
2604 precision = -1;
2605 if (*f == '.') {
2606 f++;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 precision = (*f - '0');
2609 f++;
2610 while (Py_ISDIGIT((unsigned)*f)) {
2611 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612 PyErr_SetString(PyExc_ValueError,
2613 "precision too big");
2614 return NULL;
2615 }
2616 precision = (precision * 10) + (*f - '0');
2617 f++;
2618 }
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620 if (*f == '%') {
2621 /* "%.3%s" => f points to "3" */
2622 f--;
2623 }
2624 }
2625 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002627 f--;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629
2630 /* Handle %ld, %lu, %lld and %llu. */
2631 longflag = 0;
2632 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002633 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002635 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002636 longflag = 1;
2637 ++f;
2638 }
2639#ifdef HAVE_LONG_LONG
2640 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longlongflag = 1;
2643 f += 2;
2644 }
2645#endif
2646 }
2647 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002648 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002649 size_tflag = 1;
2650 ++f;
2651 }
Victor Stinnere215d962012-10-06 23:03:36 +02002652
2653 if (f[1] == '\0')
2654 writer->overallocate = 0;
2655
2656 switch (*f) {
2657 case 'c':
2658 {
2659 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002660 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002661 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002662 "character argument not in range(0x110000)");
2663 return NULL;
2664 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002665 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002666 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002667 break;
2668 }
2669
2670 case 'i':
2671 case 'd':
2672 case 'u':
2673 case 'x':
2674 {
2675 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002676 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002677 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002678
2679 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002680 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002681 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002682 va_arg(*vargs, unsigned long));
2683#ifdef HAVE_LONG_LONG
2684 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, unsigned PY_LONG_LONG));
2687#endif
2688 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, size_t));
2691 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002692 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002693 va_arg(*vargs, unsigned int));
2694 }
2695 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 }
2698 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002699 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002700 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002701 va_arg(*vargs, long));
2702#ifdef HAVE_LONG_LONG
2703 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, PY_LONG_LONG));
2706#endif
2707 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002709 va_arg(*vargs, Py_ssize_t));
2710 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, int));
2713 }
2714 assert(len >= 0);
2715
Victor Stinnere215d962012-10-06 23:03:36 +02002716 if (precision < len)
2717 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002718
2719 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2721 return NULL;
2722
Victor Stinnere215d962012-10-06 23:03:36 +02002723 if (width > precision) {
2724 Py_UCS4 fillchar;
2725 fill = width - precision;
2726 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002727 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2728 return NULL;
2729 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002730 }
Victor Stinner15a11362012-10-06 23:48:20 +02002731 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002732 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002733 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2734 return NULL;
2735 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002736 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002737
Victor Stinner4a587072013-11-19 12:54:53 +01002738 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2739 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 break;
2741 }
2742
2743 case 'p':
2744 {
2745 char number[MAX_LONG_LONG_CHARS];
2746
2747 len = sprintf(number, "%p", va_arg(*vargs, void*));
2748 assert(len >= 0);
2749
2750 /* %p is ill-defined: ensure leading 0x. */
2751 if (number[1] == 'X')
2752 number[1] = 'x';
2753 else if (number[1] != 'x') {
2754 memmove(number + 2, number,
2755 strlen(number) + 1);
2756 number[0] = '0';
2757 number[1] = 'x';
2758 len += 2;
2759 }
2760
Victor Stinner4a587072013-11-19 12:54:53 +01002761 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002762 return NULL;
2763 break;
2764 }
2765
2766 case 's':
2767 {
2768 /* UTF-8 */
2769 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002770 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002771 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002772 break;
2773 }
2774
2775 case 'U':
2776 {
2777 PyObject *obj = va_arg(*vargs, PyObject *);
2778 assert(obj && _PyUnicode_CHECK(obj));
2779
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
2782 break;
2783 }
2784
2785 case 'V':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002789 if (obj) {
2790 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
2793 }
2794 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 assert(str != NULL);
2796 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 }
2799 break;
2800 }
2801
2802 case 'S':
2803 {
2804 PyObject *obj = va_arg(*vargs, PyObject *);
2805 PyObject *str;
2806 assert(obj);
2807 str = PyObject_Str(obj);
2808 if (!str)
2809 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002811 Py_DECREF(str);
2812 return NULL;
2813 }
2814 Py_DECREF(str);
2815 break;
2816 }
2817
2818 case 'R':
2819 {
2820 PyObject *obj = va_arg(*vargs, PyObject *);
2821 PyObject *repr;
2822 assert(obj);
2823 repr = PyObject_Repr(obj);
2824 if (!repr)
2825 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002827 Py_DECREF(repr);
2828 return NULL;
2829 }
2830 Py_DECREF(repr);
2831 break;
2832 }
2833
2834 case 'A':
2835 {
2836 PyObject *obj = va_arg(*vargs, PyObject *);
2837 PyObject *ascii;
2838 assert(obj);
2839 ascii = PyObject_ASCII(obj);
2840 if (!ascii)
2841 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002842 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002843 Py_DECREF(ascii);
2844 return NULL;
2845 }
2846 Py_DECREF(ascii);
2847 break;
2848 }
2849
2850 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002851 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002852 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 break;
2854
2855 default:
2856 /* if we stumble upon an unknown formatting code, copy the rest
2857 of the format string to the output string. (we cannot just
2858 skip the code, since there's no way to know what's in the
2859 argument list) */
2860 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
2863 f = p+len;
2864 return f;
2865 }
2866
2867 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002868 return f;
2869}
2870
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871PyObject *
2872PyUnicode_FromFormatV(const char *format, va_list vargs)
2873{
Victor Stinnere215d962012-10-06 23:03:36 +02002874 va_list vargs2;
2875 const char *f;
2876 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877
Victor Stinner8f674cc2013-04-17 23:02:17 +02002878 _PyUnicodeWriter_Init(&writer);
2879 writer.min_length = strlen(format) + 100;
2880 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002881
2882 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2883 Copy it to be able to pass a reference to a subfunction. */
2884 Py_VA_COPY(vargs2, vargs);
2885
2886 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002887 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002888 f = unicode_fromformat_arg(&writer, f, &vargs2);
2889 if (f == NULL)
2890 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002893 const char *p;
2894 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002895
Victor Stinnere215d962012-10-06 23:03:36 +02002896 p = f;
2897 do
2898 {
2899 if ((unsigned char)*p > 127) {
2900 PyErr_Format(PyExc_ValueError,
2901 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2902 "string, got a non-ASCII byte: 0x%02x",
2903 (unsigned char)*p);
2904 return NULL;
2905 }
2906 p++;
2907 }
2908 while (*p != '\0' && *p != '%');
2909 len = p - f;
2910
2911 if (*p == '\0')
2912 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002913
2914 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002915 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002916
2917 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002919 }
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return _PyUnicodeWriter_Finish(&writer);
2921
2922 fail:
2923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024{
Victor Stinner8faf8212011-12-08 22:14:11 +01003025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003030
Victor Stinner985a82a2014-01-03 12:53:47 +01003031 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003035PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003040 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003041 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_INCREF(obj);
3043 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
3045 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003048 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003052 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003057PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_BadInternalCall();
3066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 return v;
3077 }
3078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003088 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 Py_TYPE(obj)->tp_name);
3090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003091 }
Tim Petersced69f82003-09-16 20:30:58 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003097
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101}
3102
/* Normalize an encoding name; C counterpart of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied lower-cased into `lower`.
   Each run of other characters is collapsed into a single '_', except that
   a leading run produces nothing and a trailing run is dropped.  The result
   is null-terminated.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *out;
    char *out_end;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    /* Last usable slot: reserved for the terminating null character. */
    out_end = lower + (lower_len - 1);
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 Py_ssize_t size,
3155 const char *encoding,
3156 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003157{
3158 PyObject *buffer = NULL, *unicode;
3159 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162 if (encoding == NULL) {
3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003165
Fred Drakee4315f52000-05-09 19:53:39 +00003166 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168 char *lower = buflower;
3169
3170 /* Fast paths */
3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172 lower += 3;
3173 if (*lower == '_') {
3174 /* Match "utf8" and "utf_8" */
3175 lower++;
3176 }
3177
3178 if (lower[0] == '8' && lower[1] == 0) {
3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180 }
3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183 }
3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186 }
3187 }
3188 else {
3189 if (strcmp(lower, "ascii") == 0
3190 || strcmp(lower, "us_ascii") == 0) {
3191 return PyUnicode_DecodeASCII(s, size, errors);
3192 }
3193 #ifdef HAVE_MBCS
3194 else if (strcmp(lower, "mbcs") == 0) {
3195 return PyUnicode_DecodeMBCS(s, size, errors);
3196 }
3197 #endif
3198 else if (strcmp(lower, "latin1") == 0
3199 || strcmp(lower, "latin_1") == 0
3200 || strcmp(lower, "iso_8859_1") == 0
3201 || strcmp(lower, "iso8859_1") == 0) {
3202 return PyUnicode_DecodeLatin1(s, size, errors);
3203 }
3204 }
Victor Stinner37296e82010-06-10 13:36:23 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003208 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003210 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003211 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (buffer == NULL)
3213 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003214 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 if (unicode == NULL)
3216 goto onError;
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003219 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220 "use codecs.decode() to decode to arbitrary types",
3221 encoding,
3222 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_DECREF(unicode);
3224 goto onError;
3225 }
3226 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003227 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003228
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 Py_XDECREF(buffer);
3231 return NULL;
3232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
3235PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003236 const char *encoding,
3237 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238{
3239 PyObject *v;
3240
3241 if (!PyUnicode_Check(unicode)) {
3242 PyErr_BadArgument();
3243 goto onError;
3244 }
3245
3246 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248
3249 /* Decode via the codec registry */
3250 v = PyCodec_Decode(unicode, encoding, errors);
3251 if (v == NULL)
3252 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003253 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003254
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003256 return NULL;
3257}
3258
Alexander Belopolsky40018472011-02-26 01:02:56 +00003259PyObject *
3260PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003261 const char *encoding,
3262 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003263{
3264 PyObject *v;
3265
3266 if (!PyUnicode_Check(unicode)) {
3267 PyErr_BadArgument();
3268 goto onError;
3269 }
3270
3271 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003273
3274 /* Decode via the codec registry */
3275 v = PyCodec_Decode(unicode, encoding, errors);
3276 if (v == NULL)
3277 goto onError;
3278 if (!PyUnicode_Check(v)) {
3279 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003280 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3281 "use codecs.decode() to decode to arbitrary types",
3282 encoding,
3283 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 Py_DECREF(v);
3285 goto onError;
3286 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003287 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003288
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290 return NULL;
3291}
3292
Alexander Belopolsky40018472011-02-26 01:02:56 +00003293PyObject *
3294PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003295 Py_ssize_t size,
3296 const char *encoding,
3297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298{
3299 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003300
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 unicode = PyUnicode_FromUnicode(s, size);
3302 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3305 Py_DECREF(unicode);
3306 return v;
3307}
3308
Alexander Belopolsky40018472011-02-26 01:02:56 +00003309PyObject *
3310PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003311 const char *encoding,
3312 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003313{
3314 PyObject *v;
3315
3316 if (!PyUnicode_Check(unicode)) {
3317 PyErr_BadArgument();
3318 goto onError;
3319 }
3320
3321 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003323
3324 /* Encode via the codec registry */
3325 v = PyCodec_Encode(unicode, encoding, errors);
3326 if (v == NULL)
3327 goto onError;
3328 return v;
3329
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003331 return NULL;
3332}
3333
/* Find the first wide character of `wstr` that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in `wstr`, or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    cursor = wstr;
    while (*cursor != L'\0') {
        const wchar_t *here = cursor;

#if SIZEOF_WCHAR_T == 2
        unit[0] = cursor[0];
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif

        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return here - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3380
Victor Stinner1b579672011-12-17 05:47:23 +01003381static int
3382locale_error_handler(const char *errors, int *surrogateescape)
3383{
Victor Stinner50149202015-09-22 00:26:54 +02003384 _Py_error_handler error_handler = get_error_handler(errors);
3385 switch (error_handler)
3386 {
3387 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003388 *surrogateescape = 0;
3389 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003390 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003391 *surrogateescape = 1;
3392 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003393 default:
3394 PyErr_Format(PyExc_ValueError,
3395 "only 'strict' and 'surrogateescape' error handlers "
3396 "are supported, not '%s'",
3397 errors);
3398 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003399 }
Victor Stinner1b579672011-12-17 05:47:23 +01003400}
3401
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003402PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003403PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003404{
3405 Py_ssize_t wlen, wlen2;
3406 wchar_t *wstr;
3407 PyObject *bytes = NULL;
3408 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003409 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410 PyObject *exc;
3411 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003412 int surrogateescape;
3413
3414 if (locale_error_handler(errors, &surrogateescape) < 0)
3415 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416
3417 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3418 if (wstr == NULL)
3419 return NULL;
3420
3421 wlen2 = wcslen(wstr);
3422 if (wlen2 != wlen) {
3423 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003424 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 return NULL;
3426 }
3427
3428 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003429 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003430 char *str;
3431
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003432 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433 if (str == NULL) {
3434 if (error_pos == (size_t)-1) {
3435 PyErr_NoMemory();
3436 PyMem_Free(wstr);
3437 return NULL;
3438 }
3439 else {
3440 goto encode_error;
3441 }
3442 }
3443 PyMem_Free(wstr);
3444
3445 bytes = PyBytes_FromString(str);
3446 PyMem_Free(str);
3447 }
3448 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003449 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 size_t len, len2;
3451
3452 len = wcstombs(NULL, wstr, 0);
3453 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003454 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 goto encode_error;
3456 }
3457
3458 bytes = PyBytes_FromStringAndSize(NULL, len);
3459 if (bytes == NULL) {
3460 PyMem_Free(wstr);
3461 return NULL;
3462 }
3463
3464 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3465 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003466 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 goto encode_error;
3468 }
3469 PyMem_Free(wstr);
3470 }
3471 return bytes;
3472
3473encode_error:
3474 errmsg = strerror(errno);
3475 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003476
3477 if (error_pos == (size_t)-1)
3478 error_pos = wcstombs_errorpos(wstr);
3479
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003480 PyMem_Free(wstr);
3481 Py_XDECREF(bytes);
3482
Victor Stinner2f197072011-12-17 07:08:30 +01003483 if (errmsg != NULL) {
3484 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003485 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003486 if (wstr != NULL) {
3487 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003488 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003489 } else
3490 errmsg = NULL;
3491 }
3492 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003493 reason = PyUnicode_FromString(
3494 "wcstombs() encountered an unencodable "
3495 "wide character");
3496 if (reason == NULL)
3497 return NULL;
3498
3499 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3500 "locale", unicode,
3501 (Py_ssize_t)error_pos,
3502 (Py_ssize_t)(error_pos+1),
3503 reason);
3504 Py_DECREF(reason);
3505 if (exc != NULL) {
3506 PyCodec_StrictErrors(exc);
3507 Py_XDECREF(exc);
3508 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003509 return NULL;
3510}
3511
Victor Stinnerad158722010-10-27 00:25:46 +00003512PyObject *
3513PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003514{
Victor Stinner99b95382011-07-04 14:23:54 +02003515#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003516 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003517#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003519#else
Victor Stinner793b5312011-04-27 00:24:21 +02003520 PyInterpreterState *interp = PyThreadState_GET()->interp;
3521 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3522 cannot use it to encode and decode filenames before it is loaded. Load
3523 the Python codec requires to encode at least its own filename. Use the C
3524 version of the locale codec until the codec registry is initialized and
3525 the Python codec is loaded.
3526
3527 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3528 cannot only rely on it: check also interp->fscodec_initialized for
3529 subinterpreters. */
3530 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003531 return PyUnicode_AsEncodedString(unicode,
3532 Py_FileSystemDefaultEncoding,
3533 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003534 }
3535 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003536 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003537 }
Victor Stinnerad158722010-10-27 00:25:46 +00003538#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003539}
3540
Alexander Belopolsky40018472011-02-26 01:02:56 +00003541PyObject *
3542PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003543 const char *encoding,
3544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545{
3546 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003547 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003548
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 if (!PyUnicode_Check(unicode)) {
3550 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 }
Fred Drakee4315f52000-05-09 19:53:39 +00003553
Victor Stinner942889a2016-09-05 15:40:10 -07003554 if (encoding == NULL) {
3555 return _PyUnicode_AsUTF8String(unicode, errors);
3556 }
3557
Fred Drakee4315f52000-05-09 19:53:39 +00003558 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003559 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3560 char *lower = buflower;
3561
3562 /* Fast paths */
3563 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3564 lower += 3;
3565 if (*lower == '_') {
3566 /* Match "utf8" and "utf_8" */
3567 lower++;
3568 }
3569
3570 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003571 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003572 }
3573 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3574 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3575 }
3576 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3577 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3578 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003579 }
Victor Stinner942889a2016-09-05 15:40:10 -07003580 else {
3581 if (strcmp(lower, "ascii") == 0
3582 || strcmp(lower, "us_ascii") == 0) {
3583 return _PyUnicode_AsASCIIString(unicode, errors);
3584 }
Victor Stinner99b95382011-07-04 14:23:54 +02003585#ifdef HAVE_MBCS
Victor Stinner942889a2016-09-05 15:40:10 -07003586 else if (strcmp(lower, "mbcs") == 0) {
3587 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3588 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003589#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003590 else if (strcmp(lower, "latin1") == 0 ||
3591 strcmp(lower, "latin_1") == 0 ||
3592 strcmp(lower, "iso_8859_1") == 0 ||
3593 strcmp(lower, "iso8859_1") == 0) {
3594 return _PyUnicode_AsLatin1String(unicode, errors);
3595 }
3596 }
Victor Stinner37296e82010-06-10 13:36:23 +00003597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598
3599 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003600 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003602 return NULL;
3603
3604 /* The normal path */
3605 if (PyBytes_Check(v))
3606 return v;
3607
3608 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003610 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003611 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003612
3613 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003614 "encoder %s returned bytearray instead of bytes; "
3615 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003616 encoding);
3617 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003618 Py_DECREF(v);
3619 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003620 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003621
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003622 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3623 Py_DECREF(v);
3624 return b;
3625 }
3626
3627 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003628 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3629 "use codecs.encode() to encode to arbitrary types",
3630 encoding,
3631 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003632 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003633 return NULL;
3634}
3635
Alexander Belopolsky40018472011-02-26 01:02:56 +00003636PyObject *
3637PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003638 const char *encoding,
3639 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003640{
3641 PyObject *v;
3642
3643 if (!PyUnicode_Check(unicode)) {
3644 PyErr_BadArgument();
3645 goto onError;
3646 }
3647
3648 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003650
3651 /* Encode via the codec registry */
3652 v = PyCodec_Encode(unicode, encoding, errors);
3653 if (v == NULL)
3654 goto onError;
3655 if (!PyUnicode_Check(v)) {
3656 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003657 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3658 "use codecs.encode() to encode to arbitrary types",
3659 encoding,
3660 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661 Py_DECREF(v);
3662 goto onError;
3663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003665
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 return NULL;
3668}
3669
/* Find the byte offset of the first sequence in str (len bytes) that
   mbrtowc() reports as invalid or incomplete.

   Returns 0 when no such sequence is found, and always returns 0 when
   HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            return 0;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3700
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003701PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003702PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003703 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003704{
3705 wchar_t smallbuf[256];
3706 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3707 wchar_t *wstr;
3708 size_t wlen, wlen2;
3709 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003710 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003711 size_t error_pos;
3712 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003713 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3714 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003715
3716 if (locale_error_handler(errors, &surrogateescape) < 0)
3717 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003718
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003719 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3720 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721 return NULL;
3722 }
3723
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003724 if (surrogateescape) {
3725 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003726 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003727 if (wstr == NULL) {
3728 if (wlen == (size_t)-1)
3729 PyErr_NoMemory();
3730 else
3731 PyErr_SetFromErrno(PyExc_OSError);
3732 return NULL;
3733 }
3734
3735 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003736 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737 }
3738 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003739 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003740#ifndef HAVE_BROKEN_MBSTOWCS
3741 wlen = mbstowcs(NULL, str, 0);
3742#else
3743 wlen = len;
3744#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003745 if (wlen == (size_t)-1)
3746 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003747 if (wlen+1 <= smallbuf_len) {
3748 wstr = smallbuf;
3749 }
3750 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003751 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003752 if (!wstr)
3753 return PyErr_NoMemory();
3754 }
3755
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003756 wlen2 = mbstowcs(wstr, str, wlen+1);
3757 if (wlen2 == (size_t)-1) {
3758 if (wstr != smallbuf)
3759 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003760 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003761 }
3762#ifdef HAVE_BROKEN_MBSTOWCS
3763 assert(wlen2 == wlen);
3764#endif
3765 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3766 if (wstr != smallbuf)
3767 PyMem_Free(wstr);
3768 }
3769 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003770
3771decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003772 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003773 errmsg = strerror(errno);
3774 assert(errmsg != NULL);
3775
3776 error_pos = mbstowcs_errorpos(str, len);
3777 if (errmsg != NULL) {
3778 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003779 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003780 if (wstr != NULL) {
3781 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003782 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003783 }
Victor Stinner2f197072011-12-17 07:08:30 +01003784 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003785 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003786 reason = PyUnicode_FromString(
3787 "mbstowcs() encountered an invalid multibyte sequence");
3788 if (reason == NULL)
3789 return NULL;
3790
3791 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3792 "locale", str, len,
3793 (Py_ssize_t)error_pos,
3794 (Py_ssize_t)(error_pos+1),
3795 reason);
3796 Py_DECREF(reason);
3797 if (exc != NULL) {
3798 PyCodec_StrictErrors(exc);
3799 Py_XDECREF(exc);
3800 }
3801 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003802}
3803
3804PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003805PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003806{
3807 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003808 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003809}
3810
3811
3812PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003813PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003815 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3816}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003817
Christian Heimes5894ba72007-11-04 11:43:14 +00003818PyObject*
3819PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3820{
Victor Stinner99b95382011-07-04 14:23:54 +02003821#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003822 return PyUnicode_DecodeMBCS(s, size, NULL);
3823#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003824 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003825#else
Victor Stinner793b5312011-04-27 00:24:21 +02003826 PyInterpreterState *interp = PyThreadState_GET()->interp;
3827 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3828 cannot use it to encode and decode filenames before it is loaded. Load
3829 the Python codec requires to encode at least its own filename. Use the C
3830 version of the locale codec until the codec registry is initialized and
3831 the Python codec is loaded.
3832
3833 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3834 cannot only rely on it: check also interp->fscodec_initialized for
3835 subinterpreters. */
3836 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 return PyUnicode_Decode(s, size,
3838 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003839 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003840 }
3841 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003842 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003843 }
Victor Stinnerad158722010-10-27 00:25:46 +00003844#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845}
3846
Martin v. Löwis011e8422009-05-05 04:43:17 +00003847
3848int
3849PyUnicode_FSConverter(PyObject* arg, void* addr)
3850{
3851 PyObject *output = NULL;
3852 Py_ssize_t size;
3853 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003854 if (arg == NULL) {
3855 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003856 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003857 return 1;
3858 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003859 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003860 output = arg;
3861 Py_INCREF(output);
3862 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003863 else if (PyUnicode_Check(arg)) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003864 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003865 if (!output)
3866 return 0;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003867 assert(PyBytes_Check(output));
3868 }
3869 else {
3870 PyErr_Format(PyExc_TypeError,
3871 "must be str or bytes, not %.100s",
3872 Py_TYPE(arg)->tp_name);
3873 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003874 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003875 size = PyBytes_GET_SIZE(output);
3876 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003877 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003878 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003879 Py_DECREF(output);
3880 return 0;
3881 }
3882 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003883 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003884}
3885
3886
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003887int
3888PyUnicode_FSDecoder(PyObject* arg, void* addr)
3889{
3890 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003891 if (arg == NULL) {
3892 Py_DECREF(*(PyObject**)addr);
3893 return 1;
3894 }
3895 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003896 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003898 output = arg;
3899 Py_INCREF(output);
3900 }
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003901 else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
3902 if (!PyBytes_Check(arg) &&
3903 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3904 "path should be string or bytes, not %.200s",
3905 Py_TYPE(arg)->tp_name)) {
3906 return 0;
3907 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003908 arg = PyBytes_FromObject(arg);
3909 if (!arg)
3910 return 0;
3911 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3912 PyBytes_GET_SIZE(arg));
3913 Py_DECREF(arg);
3914 if (!output)
3915 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003916 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003917 else {
3918 PyErr_Format(PyExc_TypeError,
3919 "path should be string or bytes, not %.200s",
3920 Py_TYPE(arg)->tp_name);
3921 return 0;
3922 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003923 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003924 Py_DECREF(output);
3925 return 0;
3926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003928 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003929 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003930 Py_DECREF(output);
3931 return 0;
3932 }
3933 *(PyObject**)addr = output;
3934 return Py_CLEANUP_SUPPORTED;
3935}
3936
3937
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003940{
Christian Heimesf3863112007-11-22 07:46:41 +00003941 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003943 if (!PyUnicode_Check(unicode)) {
3944 PyErr_BadArgument();
3945 return NULL;
3946 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003947 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003948 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003950 if (PyUnicode_UTF8(unicode) == NULL) {
3951 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003952 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 if (bytes == NULL)
3954 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3956 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003957 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958 Py_DECREF(bytes);
3959 return NULL;
3960 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3962 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3963 PyBytes_AS_STRING(bytes),
3964 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 Py_DECREF(bytes);
3966 }
3967
3968 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003969 *psize = PyUnicode_UTF8_LENGTH(unicode);
3970 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003971}
3972
3973char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3977}
3978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979Py_UNICODE *
3980PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 const unsigned char *one_byte;
3983#if SIZEOF_WCHAR_T == 4
3984 const Py_UCS2 *two_bytes;
3985#else
3986 const Py_UCS4 *four_bytes;
3987 const Py_UCS4 *ucs4_end;
3988 Py_ssize_t num_surrogates;
3989#endif
3990 wchar_t *w;
3991 wchar_t *wchar_end;
3992
3993 if (!PyUnicode_Check(unicode)) {
3994 PyErr_BadArgument();
3995 return NULL;
3996 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 assert(_PyUnicode_KIND(unicode) != 0);
4000 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004002 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004004 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4005 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 num_surrogates = 0;
4007
4008 for (; four_bytes < ucs4_end; ++four_bytes) {
4009 if (*four_bytes > 0xFFFF)
4010 ++num_surrogates;
4011 }
4012
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004013 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4014 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4015 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 PyErr_NoMemory();
4017 return NULL;
4018 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004019 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004021 w = _PyUnicode_WSTR(unicode);
4022 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4023 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4025 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004026 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004028 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4029 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 }
4031 else
4032 *w = *four_bytes;
4033
4034 if (w > wchar_end) {
4035 assert(0 && "Miscalculated string end");
4036 }
4037 }
4038 *w = 0;
4039#else
4040 /* sizeof(wchar_t) == 4 */
4041 Py_FatalError("Impossible unicode object state, wstr and str "
4042 "should share memory already.");
4043 return NULL;
4044#endif
4045 }
4046 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004047 if ((size_t)_PyUnicode_LENGTH(unicode) >
4048 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4049 PyErr_NoMemory();
4050 return NULL;
4051 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004052 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4053 (_PyUnicode_LENGTH(unicode) + 1));
4054 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 PyErr_NoMemory();
4056 return NULL;
4057 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004058 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4059 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4060 w = _PyUnicode_WSTR(unicode);
4061 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004063 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4064 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 for (; w < wchar_end; ++one_byte, ++w)
4066 *w = *one_byte;
4067 /* null-terminate the wstr */
4068 *w = 0;
4069 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004070 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004072 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073 for (; w < wchar_end; ++two_bytes, ++w)
4074 *w = *two_bytes;
4075 /* null-terminate the wstr */
4076 *w = 0;
4077#else
4078 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004079 PyObject_FREE(_PyUnicode_WSTR(unicode));
4080 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 Py_FatalError("Impossible unicode object state, wstr "
4082 "and str should share memory already.");
4083 return NULL;
4084#endif
4085 }
4086 else {
4087 assert(0 && "This should never happen.");
4088 }
4089 }
4090 }
4091 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 *size = PyUnicode_WSTR_LENGTH(unicode);
4093 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004094}
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096Py_UNICODE *
4097PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100}
4101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102
Alexander Belopolsky40018472011-02-26 01:02:56 +00004103Py_ssize_t
4104PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105{
4106 if (!PyUnicode_Check(unicode)) {
4107 PyErr_BadArgument();
4108 goto onError;
4109 }
4110 return PyUnicode_GET_SIZE(unicode);
4111
Benjamin Peterson29060642009-01-31 22:14:21 +00004112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 return -1;
4114}
4115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116Py_ssize_t
4117PyUnicode_GetLength(PyObject *unicode)
4118{
Victor Stinner07621332012-06-16 04:53:46 +02004119 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 PyErr_BadArgument();
4121 return -1;
4122 }
Victor Stinner07621332012-06-16 04:53:46 +02004123 if (PyUnicode_READY(unicode) == -1)
4124 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 return PyUnicode_GET_LENGTH(unicode);
4126}
4127
4128Py_UCS4
4129PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4130{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004131 void *data;
4132 int kind;
4133
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004134 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4135 PyErr_BadArgument();
4136 return (Py_UCS4)-1;
4137 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004138 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004139 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140 return (Py_UCS4)-1;
4141 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004142 data = PyUnicode_DATA(unicode);
4143 kind = PyUnicode_KIND(unicode);
4144 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004145}
4146
4147int
4148PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4149{
4150 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004151 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152 return -1;
4153 }
Victor Stinner488fa492011-12-12 00:01:39 +01004154 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004155 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004156 PyErr_SetString(PyExc_IndexError, "string index out of range");
4157 return -1;
4158 }
Victor Stinner488fa492011-12-12 00:01:39 +01004159 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004160 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004161 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4162 PyErr_SetString(PyExc_ValueError, "character out of range");
4163 return -1;
4164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4166 index, ch);
4167 return 0;
4168}
4169
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string is statically allocated. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4175
Victor Stinner554f3f02010-06-16 23:33:54 +00004176/* create or adjust a UnicodeDecodeError */
4177static void
4178make_decode_exception(PyObject **exceptionObject,
4179 const char *encoding,
4180 const char *input, Py_ssize_t length,
4181 Py_ssize_t startpos, Py_ssize_t endpos,
4182 const char *reason)
4183{
4184 if (*exceptionObject == NULL) {
4185 *exceptionObject = PyUnicodeDecodeError_Create(
4186 encoding, input, length, startpos, endpos, reason);
4187 }
4188 else {
4189 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4190 goto onError;
4191 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4192 goto onError;
4193 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4194 goto onError;
4195 }
4196 return;
4197
4198onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004199 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004200}
4201
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders writing into a wchar_t
   (PyUnicode_WCHAR_KIND) output string:
   build arguments, call the callback and check the result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors / errorHandler: handler name and cached handler object; the
       handler is looked up with PyCodec_LookupError() on first use.
   encoding, reason: used to create or update *exceptionObject.
   input, inend: bounds of the input buffer; reloaded from the exception's
       object attribute because the callback may have replaced it.
   startinpos, endinpos: error range; *endinpos is set to the position
       returned by the handler.
   inptr: set to the address in the input where decoding should resume.
   output, outpos: output string and write position; the output is resized
       if needed and *outpos is advanced past the replacement.

   return 0 on success, -1 on error (exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out: the bytes accessors below must not be applied to an
           object of another type. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  A length-based copy keeps any
       characters that follow an embedded null in the replacement, which a
       string copy such as wcsncpy() would overwrite with nulls. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4317
4318static int
4319unicode_decode_call_errorhandler_writer(
4320 const char *errors, PyObject **errorHandler,
4321 const char *encoding, const char *reason,
4322 const char **input, const char **inend, Py_ssize_t *startinpos,
4323 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4324 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4325{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004326 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327
4328 PyObject *restuple = NULL;
4329 PyObject *repunicode = NULL;
4330 Py_ssize_t insize;
4331 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004332 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333 PyObject *inputobj = NULL;
4334
4335 if (*errorHandler == NULL) {
4336 *errorHandler = PyCodec_LookupError(errors);
4337 if (*errorHandler == NULL)
4338 goto onError;
4339 }
4340
4341 make_decode_exception(exceptionObject,
4342 encoding,
4343 *input, *inend - *input,
4344 *startinpos, *endinpos,
4345 reason);
4346 if (*exceptionObject == NULL)
4347 goto onError;
4348
4349 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4350 if (restuple == NULL)
4351 goto onError;
4352 if (!PyTuple_Check(restuple)) {
4353 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4354 goto onError;
4355 }
4356 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004358
4359 /* Copy back the bytes variables, which might have been modified by the
4360 callback */
4361 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4362 if (!inputobj)
4363 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004364 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004366 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004367 *input = PyBytes_AS_STRING(inputobj);
4368 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004369 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004370 /* we can DECREF safely, as the exception has another reference,
4371 so the object won't go away. */
4372 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004376 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004377 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004379 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380
Victor Stinner8f674cc2013-04-17 23:02:17 +02004381 if (PyUnicode_READY(repunicode) < 0)
4382 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004383 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004384 if (replen > 1) {
4385 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004386 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004387 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4388 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4389 goto onError;
4390 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004391 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004392 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004395 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 Py_XDECREF(restuple);
4399 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404}
4405
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that any
 * expression (e.g. "a || b") can be passed without being re-associated
 * with the surrounding &&. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Alexander Belopolsky40018472011-02-26 01:02:56 +00004487PyObject *
4488PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004489 Py_ssize_t size,
4490 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004492 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4493}
4494
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495/* The decoder. The only state we preserve is our read position,
4496 * i.e. how many characters we have consumed. So if we end in the
4497 * middle of a shift sequence we have to back off the read position
4498 * and the output to the beginning of the sequence, otherwise we lose
4499 * all the shift state (seen bits, number of bits seen, high
4500 * surrogate). */
4501
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502PyObject *
4503PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004504 Py_ssize_t size,
4505 const char *errors,
4506 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004509 Py_ssize_t startinpos;
4510 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004512 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 const char *errmsg = "";
4514 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004515 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 unsigned int base64bits = 0;
4517 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004518 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 PyObject *errorHandler = NULL;
4520 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522 if (size == 0) {
4523 if (consumed)
4524 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004525 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004526 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004528 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004529 _PyUnicodeWriter_Init(&writer);
4530 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531
4532 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533 e = s + size;
4534
4535 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004536 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004538 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004539
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 if (inShift) { /* in a base-64 section */
4541 if (IS_BASE64(ch)) { /* consume a base-64 character */
4542 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4543 base64bits += 6;
4544 s++;
4545 if (base64bits >= 16) {
4546 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004547 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 base64bits -= 16;
4549 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004550 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 if (surrogate) {
4552 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004553 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4554 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004555 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004556 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004558 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 }
4560 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004561 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004562 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
4565 }
Victor Stinner551ac952011-11-29 22:58:13 +01004566 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 /* first surrogate */
4568 surrogate = outCh;
4569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004571 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004572 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
4574 }
4575 }
4576 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 if (base64bits > 0) { /* left-over bits */
4579 if (base64bits >= 6) {
4580 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004581 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 errmsg = "partial character in shift sequence";
4583 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 else {
4586 /* Some bits remain; they should be zero */
4587 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004588 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 errmsg = "non-zero padding bits in shift sequence";
4590 goto utf7Error;
4591 }
4592 }
4593 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004594 if (surrogate && DECODE_DIRECT(ch)) {
4595 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4596 goto onError;
4597 }
4598 surrogate = 0;
4599 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* '-' is absorbed; other terminating
4601 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004602 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604 }
4605 }
4606 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 s++; /* consume '+' */
4609 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004611 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004612 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 }
4614 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004616 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004617 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004619 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 }
4621 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004624 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004625 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 else {
4628 startinpos = s-starts;
4629 s++;
4630 errmsg = "unexpected special character";
4631 goto utf7Error;
4632 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004636 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 errors, &errorHandler,
4638 "utf7", errmsg,
4639 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004640 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 }
4643
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 /* end of string */
4645
4646 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4647 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004648 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 if (surrogate ||
4650 (base64bits >= 6) ||
4651 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004653 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 errors, &errorHandler,
4655 "utf7", "unterminated shift sequence",
4656 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004657 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 goto onError;
4659 if (s < e)
4660 goto restart;
4661 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663
4664 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004665 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004667 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004668 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004669 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004670 writer.kind, writer.data, shiftOutStart);
4671 Py_XDECREF(errorHandler);
4672 Py_XDECREF(exc);
4673 _PyUnicodeWriter_Dealloc(&writer);
4674 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004675 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004676 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 }
4678 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004679 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004681 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 Py_XDECREF(errorHandler);
4684 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004686
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 Py_XDECREF(errorHandler);
4689 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 return NULL;
4692}
4693
4694
Alexander Belopolsky40018472011-02-26 01:02:56 +00004695PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004696_PyUnicode_EncodeUTF7(PyObject *str,
4697 int base64SetO,
4698 int base64WhiteSpace,
4699 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004700{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 int kind;
4702 void *data;
4703 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004704 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004706 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 unsigned int base64bits = 0;
4708 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709 char * out;
4710 char * start;
4711
Benjamin Petersonbac79492012-01-14 13:34:47 -05004712 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004713 return NULL;
4714 kind = PyUnicode_KIND(str);
4715 data = PyUnicode_DATA(str);
4716 len = PyUnicode_GET_LENGTH(str);
4717
4718 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004721 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004722 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004723 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004724 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725 if (v == NULL)
4726 return NULL;
4727
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004728 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004729 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004730 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Antoine Pitrou244651a2009-05-04 18:56:13 +00004732 if (inShift) {
4733 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4734 /* shifting out */
4735 if (base64bits) { /* output remaining bits */
4736 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4737 base64buffer = 0;
4738 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 }
4740 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 /* Characters not in the BASE64 set implicitly unshift the sequence
4742 so no '-' is required, except if the character is itself a '-' */
4743 if (IS_BASE64(ch) || ch == '-') {
4744 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 *out++ = (char) ch;
4747 }
4748 else {
4749 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 else { /* not in a shift sequence */
4753 if (ch == '+') {
4754 *out++ = '+';
4755 *out++ = '-';
4756 }
4757 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4758 *out++ = (char) ch;
4759 }
4760 else {
4761 *out++ = '+';
4762 inShift = 1;
4763 goto encode_char;
4764 }
4765 }
4766 continue;
4767encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004769 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004770
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771 /* code first surrogate */
4772 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004773 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 while (base64bits >= 6) {
4775 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4776 base64bits -= 6;
4777 }
4778 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004779 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004780 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 base64bits += 16;
4782 base64buffer = (base64buffer << 16) | ch;
4783 while (base64bits >= 6) {
4784 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4785 base64bits -= 6;
4786 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004787 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 if (base64bits)
4789 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4790 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004791 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004792 if (_PyBytes_Resize(&v, out - start) < 0)
4793 return NULL;
4794 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004795}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004796PyObject *
4797PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4798 Py_ssize_t size,
4799 int base64SetO,
4800 int base64WhiteSpace,
4801 const char *errors)
4802{
4803 PyObject *result;
4804 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4805 if (tmp == NULL)
4806 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004807 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004808 base64WhiteSpace, errors);
4809 Py_DECREF(tmp);
4810 return result;
4811}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004818
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819/* --- UTF-8 Codec -------------------------------------------------------- */
4820
Alexander Belopolsky40018472011-02-26 01:02:56 +00004821PyObject *
4822PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004823 Py_ssize_t size,
4824 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825{
Walter Dörwald69652032004-09-07 20:24:22 +00004826 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4827}
4828
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004829#include "stringlib/asciilib.h"
4830#include "stringlib/codecs.h"
4831#include "stringlib/undef.h"
4832
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004833#include "stringlib/ucs1lib.h"
4834#include "stringlib/codecs.h"
4835#include "stringlib/undef.h"
4836
4837#include "stringlib/ucs2lib.h"
4838#include "stringlib/codecs.h"
4839#include "stringlib/undef.h"
4840
4841#include "stringlib/ucs4lib.h"
4842#include "stringlib/codecs.h"
4843#include "stringlib/undef.h"
4844
Antoine Pitrouab868312009-01-10 15:40:25 +00004845/* Mask to quickly check whether a C 'long' contains a
4846 non-ASCII, UTF8-encoded char. */
4847#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004848# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004849#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004850# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004851#else
4852# error C 'long' size should be either 4 or 8!
4853#endif
4854
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004855static Py_ssize_t
4856ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004859 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004861 /*
4862 * Issue #17237: m68k is a bit different from most architectures in
4863 * that objects do not use "natural alignment" - for example, int and
4864 * long are only aligned at 2-byte boundaries. Therefore the assert()
4865 * won't work; also, tests have shown that skipping the "optimised
4866 * version" will even speed up m68k.
4867 */
4868#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004870 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4871 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 /* Fast path, see in STRINGLIB(utf8_decode) for
4873 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004874 /* Help allocation */
4875 const char *_p = p;
4876 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877 while (_p < aligned_end) {
4878 unsigned long value = *(const unsigned long *) _p;
4879 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 *((unsigned long *)q) = value;
4882 _p += SIZEOF_LONG;
4883 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004884 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 p = _p;
4886 while (p < end) {
4887 if ((unsigned char)*p & 0x80)
4888 break;
4889 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004893#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004894#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 while (p < end) {
4896 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4897 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004898 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004899 /* Help allocation */
4900 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 while (_p < aligned_end) {
4902 unsigned long value = *(unsigned long *) _p;
4903 if (value & ASCII_CHAR_MASK)
4904 break;
4905 _p += SIZEOF_LONG;
4906 }
4907 p = _p;
4908 if (_p == end)
4909 break;
4910 }
4911 if ((unsigned char)*p & 0x80)
4912 break;
4913 ++p;
4914 }
4915 memcpy(dest, start, p - start);
4916 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917}
Antoine Pitrouab868312009-01-10 15:40:25 +00004918
Victor Stinner785938e2011-12-11 20:09:03 +01004919PyObject *
4920PyUnicode_DecodeUTF8Stateful(const char *s,
4921 Py_ssize_t size,
4922 const char *errors,
4923 Py_ssize_t *consumed)
4924{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004925 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004926 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928
4929 Py_ssize_t startinpos;
4930 Py_ssize_t endinpos;
4931 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004932 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004934 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004935
4936 if (size == 0) {
4937 if (consumed)
4938 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004939 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004940 }
4941
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4943 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004944 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 *consumed = 1;
4946 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004947 }
4948
Victor Stinner8f674cc2013-04-17 23:02:17 +02004949 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004950 writer.min_length = size;
4951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004952 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004953
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 writer.pos = ascii_decode(s, end, writer.data);
4955 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 while (s < end) {
4957 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004959
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004961 if (PyUnicode_IS_ASCII(writer.buffer))
4962 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004964 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 } else {
4968 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 }
4971
4972 switch (ch) {
4973 case 0:
4974 if (s == end || consumed)
4975 goto End;
4976 errmsg = "unexpected end of data";
4977 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004978 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 break;
4980 case 1:
4981 errmsg = "invalid start byte";
4982 startinpos = s - starts;
4983 endinpos = startinpos + 1;
4984 break;
4985 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004986 case 3:
4987 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 errmsg = "invalid continuation byte";
4989 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004990 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 break;
4992 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004993 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 goto onError;
4995 continue;
4996 }
4997
Victor Stinner1d65d912015-10-05 13:43:50 +02004998 if (error_handler == _Py_ERROR_UNKNOWN)
4999 error_handler = get_error_handler(errors);
5000
5001 switch (error_handler) {
5002 case _Py_ERROR_IGNORE:
5003 s += (endinpos - startinpos);
5004 break;
5005
5006 case _Py_ERROR_REPLACE:
5007 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5008 goto onError;
5009 s += (endinpos - startinpos);
5010 break;
5011
5012 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005013 {
5014 Py_ssize_t i;
5015
Victor Stinner1d65d912015-10-05 13:43:50 +02005016 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5017 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005018 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005019 ch = (Py_UCS4)(unsigned char)(starts[i]);
5020 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5021 ch + 0xdc00);
5022 writer.pos++;
5023 }
5024 s += (endinpos - startinpos);
5025 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005026 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005027
5028 default:
5029 if (unicode_decode_call_errorhandler_writer(
5030 errors, &error_handler_obj,
5031 "utf-8", errmsg,
5032 &starts, &end, &startinpos, &endinpos, &exc, &s,
5033 &writer))
5034 goto onError;
5035 }
Victor Stinner785938e2011-12-11 20:09:03 +01005036 }
5037
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 if (consumed)
5040 *consumed = s - starts;
5041
Victor Stinner1d65d912015-10-05 13:43:50 +02005042 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005044 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005045
5046onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005047 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005048 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005051}
5052
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005053#ifdef __APPLE__
5054
5055/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005056 used to decode the command line arguments on Mac OS X.
5057
5058 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005059 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005060
5061wchar_t*
5062_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5063{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005064 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005065 wchar_t *unicode;
5066 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005067
5068 /* Note: size will always be longer than the resulting Unicode
5069 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005070 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005071 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005072 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005073 if (!unicode)
5074 return NULL;
5075
5076 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005077 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005079 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005081#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005083#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005085#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086 if (ch > 0xFF) {
5087#if SIZEOF_WCHAR_T == 4
5088 assert(0);
5089#else
5090 assert(Py_UNICODE_IS_SURROGATE(ch));
5091 /* compute and append the two surrogates: */
5092 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5093 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5094#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005095 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005096 else {
5097 if (!ch && s == e)
5098 break;
5099 /* surrogateescape */
5100 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5101 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005102 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104 return unicode;
5105}
5106
5107#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109/* Primary internal function which creates utf8 encoded bytes objects.
5110
5111 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005112 and allocate exactly as much space needed at the end. Else allocate the
5113 maximum possible needed (4 result bytes per Unicode character), and return
5114 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005115*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005116PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005117_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118{
Victor Stinner6099a032011-12-18 14:22:26 +01005119 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 void *data;
5121 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 if (!PyUnicode_Check(unicode)) {
5124 PyErr_BadArgument();
5125 return NULL;
5126 }
5127
5128 if (PyUnicode_READY(unicode) == -1)
5129 return NULL;
5130
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005131 if (PyUnicode_UTF8(unicode))
5132 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5133 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134
5135 kind = PyUnicode_KIND(unicode);
5136 data = PyUnicode_DATA(unicode);
5137 size = PyUnicode_GET_LENGTH(unicode);
5138
Benjamin Petersonead6b532011-12-20 17:23:42 -06005139 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005140 default:
5141 assert(0);
5142 case PyUnicode_1BYTE_KIND:
5143 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5144 assert(!PyUnicode_IS_ASCII(unicode));
5145 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5146 case PyUnicode_2BYTE_KIND:
5147 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5148 case PyUnicode_4BYTE_KIND:
5149 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151}
5152
Alexander Belopolsky40018472011-02-26 01:02:56 +00005153PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5155 Py_ssize_t size,
5156 const char *errors)
5157{
5158 PyObject *v, *unicode;
5159
5160 unicode = PyUnicode_FromUnicode(s, size);
5161 if (unicode == NULL)
5162 return NULL;
5163 v = _PyUnicode_AsUTF8String(unicode, errors);
5164 Py_DECREF(unicode);
5165 return v;
5166}
5167
5168PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005169PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005171 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172}
5173
Walter Dörwald41980ca2007-08-16 21:55:45 +00005174/* --- UTF-32 Codec ------------------------------------------------------- */
5175
5176PyObject *
5177PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 Py_ssize_t size,
5179 const char *errors,
5180 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005181{
5182 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5183}
5184
5185PyObject *
5186PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 Py_ssize_t size,
5188 const char *errors,
5189 int *byteorder,
5190 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005191{
5192 const char *starts = s;
5193 Py_ssize_t startinpos;
5194 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005195 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005196 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005197 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005198 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005199 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200 PyObject *errorHandler = NULL;
5201 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005202
Walter Dörwald41980ca2007-08-16 21:55:45 +00005203 q = (unsigned char *)s;
5204 e = q + size;
5205
5206 if (byteorder)
5207 bo = *byteorder;
5208
5209 /* Check for BOM marks (U+FEFF) in the input and adjust current
5210 byte order setting accordingly. In native mode, the leading BOM
5211 mark is skipped, in all other modes, it is copied to the output
5212 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005213 if (bo == 0 && size >= 4) {
5214 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5215 if (bom == 0x0000FEFF) {
5216 bo = -1;
5217 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005219 else if (bom == 0xFFFE0000) {
5220 bo = 1;
5221 q += 4;
5222 }
5223 if (byteorder)
5224 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225 }
5226
Victor Stinnere64322e2012-10-30 23:12:47 +01005227 if (q == e) {
5228 if (consumed)
5229 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005230 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005231 }
5232
Victor Stinnere64322e2012-10-30 23:12:47 +01005233#ifdef WORDS_BIGENDIAN
5234 le = bo < 0;
5235#else
5236 le = bo <= 0;
5237#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005238 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005239
Victor Stinner8f674cc2013-04-17 23:02:17 +02005240 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005241 writer.min_length = (e - q + 3) / 4;
5242 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005243 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005244
Victor Stinnere64322e2012-10-30 23:12:47 +01005245 while (1) {
5246 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005248
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005250 enum PyUnicode_Kind kind = writer.kind;
5251 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005252 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005254 if (le) {
5255 do {
5256 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5257 if (ch > maxch)
5258 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005259 if (kind != PyUnicode_1BYTE_KIND &&
5260 Py_UNICODE_IS_SURROGATE(ch))
5261 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005262 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005263 q += 4;
5264 } while (q <= last);
5265 }
5266 else {
5267 do {
5268 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5269 if (ch > maxch)
5270 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005271 if (kind != PyUnicode_1BYTE_KIND &&
5272 Py_UNICODE_IS_SURROGATE(ch))
5273 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005274 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005275 q += 4;
5276 } while (q <= last);
5277 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005278 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 }
5280
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005281 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005282 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005283 startinpos = ((const char *)q) - starts;
5284 endinpos = startinpos + 4;
5285 }
5286 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005289 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 startinpos = ((const char *)q) - starts;
5292 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 else {
5295 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005296 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 goto onError;
5298 q += 4;
5299 continue;
5300 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005301 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 startinpos = ((const char *)q) - starts;
5303 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005305
5306 /* The remaining input chars are ignored if the callback
5307 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005310 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314 }
5315
Walter Dörwald41980ca2007-08-16 21:55:45 +00005316 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005318
Walter Dörwald41980ca2007-08-16 21:55:45 +00005319 Py_XDECREF(errorHandler);
5320 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005321 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005322
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005324 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005325 Py_XDECREF(errorHandler);
5326 Py_XDECREF(exc);
5327 return NULL;
5328}
5329
5330PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005331_PyUnicode_EncodeUTF32(PyObject *str,
5332 const char *errors,
5333 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005334{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005335 enum PyUnicode_Kind kind;
5336 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005337 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005338 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005339 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005340#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005341 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005342#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005343 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005344#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005345 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005346 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005347 PyObject *errorHandler = NULL;
5348 PyObject *exc = NULL;
5349 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005350
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005351 if (!PyUnicode_Check(str)) {
5352 PyErr_BadArgument();
5353 return NULL;
5354 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005355 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005356 return NULL;
5357 kind = PyUnicode_KIND(str);
5358 data = PyUnicode_DATA(str);
5359 len = PyUnicode_GET_LENGTH(str);
5360
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005361 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005362 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005363 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005364 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 if (v == NULL)
5366 return NULL;
5367
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 /* output buffer is 4-bytes aligned */
5369 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5370 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005372 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005376 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005377 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005378 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005379 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 else
5381 encoding = "utf-32";
5382
5383 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005384 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5385 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386 }
5387
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 pos = 0;
5389 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391
5392 if (kind == PyUnicode_2BYTE_KIND) {
5393 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5394 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005395 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005396 else {
5397 assert(kind == PyUnicode_4BYTE_KIND);
5398 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5399 &out, native_ordering);
5400 }
5401 if (pos == len)
5402 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005403
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005404 rep = unicode_encode_call_errorhandler(
5405 errors, &errorHandler,
5406 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005408 if (!rep)
5409 goto error;
5410
5411 if (PyBytes_Check(rep)) {
5412 repsize = PyBytes_GET_SIZE(rep);
5413 if (repsize & 3) {
5414 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005416 "surrogates not allowed");
5417 goto error;
5418 }
5419 moreunits = repsize / 4;
5420 }
5421 else {
5422 assert(PyUnicode_Check(rep));
5423 if (PyUnicode_READY(rep) < 0)
5424 goto error;
5425 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5426 if (!PyUnicode_IS_ASCII(rep)) {
5427 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 "surrogates not allowed");
5430 goto error;
5431 }
5432 }
5433
5434 /* four bytes are reserved for each surrogate */
5435 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 Py_ssize_t morebytes = 4 * (moreunits - 1);
5438 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5439 /* integer overflow */
5440 PyErr_NoMemory();
5441 goto error;
5442 }
5443 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5444 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005445 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 }
5447
5448 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5450 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005451 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005453 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5454 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005455 }
5456
5457 Py_CLEAR(rep);
5458 }
5459
5460 /* Cut back to size actually needed. This is necessary for, for example,
5461 encoding of a string containing isolated surrogates and the 'ignore'
5462 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 if (nsize != PyBytes_GET_SIZE(v))
5465 _PyBytes_Resize(&v, nsize);
5466 Py_XDECREF(errorHandler);
5467 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005468 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005469 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 error:
5471 Py_XDECREF(rep);
5472 Py_XDECREF(errorHandler);
5473 Py_XDECREF(exc);
5474 Py_XDECREF(v);
5475 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005476}
5477
Alexander Belopolsky40018472011-02-26 01:02:56 +00005478PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5480 Py_ssize_t size,
5481 const char *errors,
5482 int byteorder)
5483{
5484 PyObject *result;
5485 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5486 if (tmp == NULL)
5487 return NULL;
5488 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5489 Py_DECREF(tmp);
5490 return result;
5491}
5492
5493PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005494PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005495{
Victor Stinnerb960b342011-11-20 19:12:52 +01005496 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005497}
5498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499/* --- UTF-16 Codec ------------------------------------------------------- */
5500
Tim Peters772747b2001-08-09 22:21:55 +00005501PyObject *
5502PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 Py_ssize_t size,
5504 const char *errors,
5505 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
Walter Dörwald69652032004-09-07 20:24:22 +00005507 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5508}
5509
5510PyObject *
5511PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 Py_ssize_t size,
5513 const char *errors,
5514 int *byteorder,
5515 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 Py_ssize_t startinpos;
5519 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005521 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005522 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005523 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005524 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525 PyObject *errorHandler = NULL;
5526 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005527 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Tim Peters772747b2001-08-09 22:21:55 +00005529 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005530 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531
5532 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005533 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005535 /* Check for BOM marks (U+FEFF) in the input and adjust current
5536 byte order setting accordingly. In native mode, the leading BOM
5537 mark is skipped, in all other modes, it is copied to the output
5538 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005539 if (bo == 0 && size >= 2) {
5540 const Py_UCS4 bom = (q[1] << 8) | q[0];
5541 if (bom == 0xFEFF) {
5542 q += 2;
5543 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005545 else if (bom == 0xFFFE) {
5546 q += 2;
5547 bo = 1;
5548 }
5549 if (byteorder)
5550 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552
Antoine Pitrou63065d72012-05-15 23:48:04 +02005553 if (q == e) {
5554 if (consumed)
5555 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005556 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005557 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005558
Christian Heimes743e0cd2012-10-17 23:52:17 +02005559#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005560 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005561 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005562#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005564 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005565#endif
Tim Peters772747b2001-08-09 22:21:55 +00005566
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567 /* Note: size will always be longer than the resulting Unicode
5568 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005569 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005570 writer.min_length = (e - q + 1) / 2;
5571 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 while (1) {
5575 Py_UCS4 ch = 0;
5576 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005577 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005579 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005580 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005581 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005582 native_ordering);
5583 else
5584 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005585 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 native_ordering);
5587 } else if (kind == PyUnicode_2BYTE_KIND) {
5588 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 native_ordering);
5591 } else {
5592 assert(kind == PyUnicode_4BYTE_KIND);
5593 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005594 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005596 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598
Antoine Pitrou63065d72012-05-15 23:48:04 +02005599 switch (ch)
5600 {
5601 case 0:
5602 /* remaining byte at the end? (size should be even) */
5603 if (q == e || consumed)
5604 goto End;
5605 errmsg = "truncated data";
5606 startinpos = ((const char *)q) - starts;
5607 endinpos = ((const char *)e) - starts;
5608 break;
5609 /* The remaining input chars are ignored if the callback
5610 chooses to skip the input */
5611 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005612 q -= 2;
5613 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005614 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005616 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617 endinpos = ((const char *)e) - starts;
5618 break;
5619 case 2:
5620 errmsg = "illegal encoding";
5621 startinpos = ((const char *)q) - 2 - starts;
5622 endinpos = startinpos + 2;
5623 break;
5624 case 3:
5625 errmsg = "illegal UTF-16 surrogate";
5626 startinpos = ((const char *)q) - 4 - starts;
5627 endinpos = startinpos + 2;
5628 break;
5629 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005630 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005631 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 continue;
5633 }
5634
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005636 errors,
5637 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005639 &starts,
5640 (const char **)&e,
5641 &startinpos,
5642 &endinpos,
5643 &exc,
5644 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
5648
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649End:
Walter Dörwald69652032004-09-07 20:24:22 +00005650 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005652
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 Py_XDECREF(errorHandler);
5654 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005655 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005658 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 Py_XDECREF(errorHandler);
5660 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 return NULL;
5662}
5663
Tim Peters772747b2001-08-09 22:21:55 +00005664PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665_PyUnicode_EncodeUTF16(PyObject *str,
5666 const char *errors,
5667 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005669 enum PyUnicode_Kind kind;
5670 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005671 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005672 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005673 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005674 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005675#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005676 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005677#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005678 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005679#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 const char *encoding;
5681 Py_ssize_t nsize, pos;
5682 PyObject *errorHandler = NULL;
5683 PyObject *exc = NULL;
5684 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005685
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005686 if (!PyUnicode_Check(str)) {
5687 PyErr_BadArgument();
5688 return NULL;
5689 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005690 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005691 return NULL;
5692 kind = PyUnicode_KIND(str);
5693 data = PyUnicode_DATA(str);
5694 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005695
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005696 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005697 if (kind == PyUnicode_4BYTE_KIND) {
5698 const Py_UCS4 *in = (const Py_UCS4 *)data;
5699 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005700 while (in < end) {
5701 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005702 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005703 }
5704 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005705 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005706 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005708 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005709 nsize = len + pairs + (byteorder == 0);
5710 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005711 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005715 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005716 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005718 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005719 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005720 }
5721 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005722 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005723 }
Tim Peters772747b2001-08-09 22:21:55 +00005724
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 if (kind == PyUnicode_1BYTE_KIND) {
5726 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5727 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005728 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005729
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005730 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005732 }
5733 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005735 }
5736 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005737 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005738 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739
5740 pos = 0;
5741 while (pos < len) {
5742 Py_ssize_t repsize, moreunits;
5743
5744 if (kind == PyUnicode_2BYTE_KIND) {
5745 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5746 &out, native_ordering);
5747 }
5748 else {
5749 assert(kind == PyUnicode_4BYTE_KIND);
5750 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5751 &out, native_ordering);
5752 }
5753 if (pos == len)
5754 break;
5755
5756 rep = unicode_encode_call_errorhandler(
5757 errors, &errorHandler,
5758 encoding, "surrogates not allowed",
5759 str, &exc, pos, pos + 1, &pos);
5760 if (!rep)
5761 goto error;
5762
5763 if (PyBytes_Check(rep)) {
5764 repsize = PyBytes_GET_SIZE(rep);
5765 if (repsize & 1) {
5766 raise_encode_exception(&exc, encoding,
5767 str, pos - 1, pos,
5768 "surrogates not allowed");
5769 goto error;
5770 }
5771 moreunits = repsize / 2;
5772 }
5773 else {
5774 assert(PyUnicode_Check(rep));
5775 if (PyUnicode_READY(rep) < 0)
5776 goto error;
5777 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5778 if (!PyUnicode_IS_ASCII(rep)) {
5779 raise_encode_exception(&exc, encoding,
5780 str, pos - 1, pos,
5781 "surrogates not allowed");
5782 goto error;
5783 }
5784 }
5785
5786 /* two bytes are reserved for each surrogate */
5787 if (moreunits > 1) {
5788 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5789 Py_ssize_t morebytes = 2 * (moreunits - 1);
5790 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5791 /* integer overflow */
5792 PyErr_NoMemory();
5793 goto error;
5794 }
5795 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5796 goto error;
5797 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5798 }
5799
5800 if (PyBytes_Check(rep)) {
5801 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5802 out += moreunits;
5803 } else /* rep is unicode */ {
5804 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5805 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5806 &out, native_ordering);
5807 }
5808
5809 Py_CLEAR(rep);
5810 }
5811
5812 /* Cut back to size actually needed. This is necessary for, for example,
5813 encoding of a string containing isolated surrogates and the 'ignore' handler
5814 is used. */
5815 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5816 if (nsize != PyBytes_GET_SIZE(v))
5817 _PyBytes_Resize(&v, nsize);
5818 Py_XDECREF(errorHandler);
5819 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005820 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005821 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005822 error:
5823 Py_XDECREF(rep);
5824 Py_XDECREF(errorHandler);
5825 Py_XDECREF(exc);
5826 Py_XDECREF(v);
5827 return NULL;
5828#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829}
5830
Alexander Belopolsky40018472011-02-26 01:02:56 +00005831PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005832PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5833 Py_ssize_t size,
5834 const char *errors,
5835 int byteorder)
5836{
5837 PyObject *result;
5838 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5839 if (tmp == NULL)
5840 return NULL;
5841 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5842 Py_DECREF(tmp);
5843 return result;
5844}
5845
5846PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005847PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005849 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850}
5851
5852/* --- Unicode Escape Codec ----------------------------------------------- */
5853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005854/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5855 if all the escapes in the string make it still a valid ASCII string.
5856 Returns -1 if any escapes were found which cause the string to
5857 pop out of ASCII range. Otherwise returns the length of the
5858 required buffer to hold the string.
5859 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005860static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005861length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5862{
5863 const unsigned char *p = (const unsigned char *)s;
5864 const unsigned char *end = p + size;
5865 Py_ssize_t length = 0;
5866
5867 if (size < 0)
5868 return -1;
5869
5870 for (; p < end; ++p) {
5871 if (*p > 127) {
5872 /* Non-ASCII */
5873 return -1;
5874 }
5875 else if (*p != '\\') {
5876 /* Normal character */
5877 ++length;
5878 }
5879 else {
5880 /* Backslash-escape, check next char */
5881 ++p;
5882 /* Escape sequence reaches till end of string or
5883 non-ASCII follow-up. */
5884 if (p >= end || *p > 127)
5885 return -1;
5886 switch (*p) {
5887 case '\n':
5888 /* backslash + \n result in zero characters */
5889 break;
5890 case '\\': case '\'': case '\"':
5891 case 'b': case 'f': case 't':
5892 case 'n': case 'r': case 'v': case 'a':
5893 ++length;
5894 break;
5895 case '0': case '1': case '2': case '3':
5896 case '4': case '5': case '6': case '7':
5897 case 'x': case 'u': case 'U': case 'N':
5898 /* these do not guarantee ASCII characters */
5899 return -1;
5900 default:
5901 /* count the backslash + the other character */
5902 length += 2;
5903 }
5904 }
5905 }
5906 return length;
5907}
5908
Fredrik Lundh06d12682001-01-24 07:59:11 +00005909static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005910
Alexander Belopolsky40018472011-02-26 01:02:56 +00005911PyObject *
5912PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005913 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005917 Py_ssize_t startinpos;
5918 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005919 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921 char* message;
5922 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 PyObject *errorHandler = NULL;
5924 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005925 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005926
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005928 if (len == 0)
5929 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930
5931 /* After length_of_escaped_ascii_string() there are two alternatives,
5932 either the string is pure ASCII with named escapes like \n, etc.
5933 and we determined it's exact size (common case)
5934 or it contains \x, \u, ... escape sequences. then we create a
5935 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005936 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005937 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005938 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005939 }
5940 else {
5941 /* Escaped strings will always be longer than the resulting
5942 Unicode string, so we start with size here and then reduce the
5943 length after conversion to the true value.
5944 (but if the error callback returns a long replacement string
5945 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005946 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005947 }
5948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005950 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 while (s < end) {
5954 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005955 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
5958 /* Non-escape characters are interpreted as Unicode ordinals */
5959 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005960 x = (unsigned char)*s;
5961 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005962 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 continue;
5965 }
5966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 /* \ - Escapes */
5969 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005970 c = *s++;
5971 if (s > end)
5972 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005974 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005977#define WRITECHAR(ch) \
5978 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005979 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005980 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005981 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005984 case '\\': WRITECHAR('\\'); break;
5985 case '\'': WRITECHAR('\''); break;
5986 case '\"': WRITECHAR('\"'); break;
5987 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005989 case 'f': WRITECHAR('\014'); break;
5990 case 't': WRITECHAR('\t'); break;
5991 case 'n': WRITECHAR('\n'); break;
5992 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005994 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005995 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005996 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case '0': case '1': case '2': case '3':
6000 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006001 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006002 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006003 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006004 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006005 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 break;
6009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 /* hex escapes */
6011 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 digits = 2;
6014 message = "truncated \\xXX escape";
6015 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 digits = 4;
6020 message = "truncated \\uXXXX escape";
6021 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006024 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006025 digits = 8;
6026 message = "truncated \\UXXXXXXXX escape";
6027 hexescape:
6028 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006029 if (end - s < digits) {
6030 /* count only hex digits */
6031 for (; s < end; ++s) {
6032 c = (unsigned char)*s;
6033 if (!Py_ISXDIGIT(c))
6034 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006035 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006036 goto error;
6037 }
6038 for (; digits--; ++s) {
6039 c = (unsigned char)*s;
6040 if (!Py_ISXDIGIT(c))
6041 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006042 chr = (chr<<4) & ~0xF;
6043 if (c >= '0' && c <= '9')
6044 chr += c - '0';
6045 else if (c >= 'a' && c <= 'f')
6046 chr += 10 + c - 'a';
6047 else
6048 chr += 10 + c - 'A';
6049 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006050 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 /* _decoding_error will have already written into the
6052 target buffer. */
6053 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006055 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006056 message = "illegal Unicode character";
6057 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02006058 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006059 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 break;
6061
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 case 'N':
6064 message = "malformed \\N character escape";
6065 if (ucnhash_CAPI == NULL) {
6066 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006067 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6068 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 if (ucnhash_CAPI == NULL)
6070 goto ucnhashError;
6071 }
6072 if (*s == '{') {
6073 const char *start = s+1;
6074 /* look for the closing brace */
6075 while (*s != '}' && s < end)
6076 s++;
6077 if (s > start && s < end && *s == '}') {
6078 /* found a name. look it up in the unicode database */
6079 message = "unknown Unicode character name";
6080 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006081 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006082 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006083 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006084 goto store;
6085 }
6086 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006087 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006088
6089 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006090 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 message = "\\ at end of string";
6092 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006093 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006094 }
6095 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006096 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006097 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006098 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006099 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006101 continue;
6102
6103 error:
6104 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006105 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006106 errors, &errorHandler,
6107 "unicodeescape", message,
6108 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006109 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006110 goto onError;
6111 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006113#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006118
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006120 PyErr_SetString(
6121 PyExc_UnicodeError,
6122 "\\N escapes not supported (can't load unicodedata module)"
6123 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006124 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006127 return NULL;
6128
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006130 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 Py_XDECREF(errorHandler);
6132 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 return NULL;
6134}
6135
/* Return the Unicode-Escape encoded version of the Unicode object.

   The function takes no `quotes` argument: the result is never wrapped
   in quotes.

*/
6142
Alexander Belopolsky40018472011-02-26 01:02:56 +00006143PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 int kind;
6149 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006150 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Ezio Melottie7f90372012-10-05 03:33:31 +03006152 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006153 escape.
6154
Ezio Melottie7f90372012-10-05 03:33:31 +03006155 For UCS1 strings it's '\xxx', 4 bytes per source character.
6156 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6157 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006158 */
6159
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 if (!PyUnicode_Check(unicode)) {
6161 PyErr_BadArgument();
6162 return NULL;
6163 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006164 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006166
6167 _PyBytesWriter_Init(&writer);
6168
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 len = PyUnicode_GET_LENGTH(unicode);
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172
Victor Stinner358af132015-10-12 22:36:57 +02006173 p = _PyBytesWriter_Alloc(&writer, len);
6174 if (p == NULL)
6175 goto error;
6176 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006179 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006180
Walter Dörwald79e913e2007-05-12 11:08:06 +00006181 /* Escape backslashes */
6182 if (ch == '\\') {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006183 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006184 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6185 if (p == NULL)
6186 goto error;
6187
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 *p++ = '\\';
6189 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006190 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006191 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006192
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006193 /* Map 21-bit characters to '\U00xxxxxx' */
6194 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006195 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006196
6197 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6198 if (p == NULL)
6199 goto error;
6200
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006201 *p++ = '\\';
6202 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006203 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6204 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6205 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6206 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6207 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6210 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006212 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006213
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006215 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006216 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6217 if (p == NULL)
6218 goto error;
6219
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 *p++ = '\\';
6221 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006222 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6223 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6224 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6225 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006227
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006228 /* Map special whitespace to '\t', \n', '\r' */
6229 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006230 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6231 if (p == NULL)
6232 goto error;
6233
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006234 *p++ = '\\';
6235 *p++ = 't';
6236 }
6237 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006238 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6239 if (p == NULL)
6240 goto error;
6241
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006242 *p++ = '\\';
6243 *p++ = 'n';
6244 }
6245 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006246 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6247 if (p == NULL)
6248 goto error;
6249
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006250 *p++ = '\\';
6251 *p++ = 'r';
6252 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006253
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006254 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006255 else if (ch < ' ' || ch >= 0x7F) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006256 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006257 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6258 if (p == NULL)
6259 goto error;
6260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006262 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006263 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6264 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006265 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 /* Copy everything else as-is */
6268 else
6269 *p++ = (char) ch;
6270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
Victor Stinner358af132015-10-12 22:36:57 +02006272 return _PyBytesWriter_Finish(&writer, p);
6273
6274error:
6275 _PyBytesWriter_Dealloc(&writer);
6276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277}
6278
Alexander Belopolsky40018472011-02-26 01:02:56 +00006279PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006280PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6281 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283 PyObject *result;
6284 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6285 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006287 result = PyUnicode_AsUnicodeEscapeString(tmp);
6288 Py_DECREF(tmp);
6289 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290}
6291
6292/* --- Raw Unicode Escape Codec ------------------------------------------- */
6293
Alexander Belopolsky40018472011-02-26 01:02:56 +00006294PyObject *
6295PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006296 Py_ssize_t size,
6297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 Py_ssize_t startinpos;
6301 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006302 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 const char *end;
6304 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305 PyObject *errorHandler = NULL;
6306 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006307
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006308 if (size == 0)
6309 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006310
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 /* Escaped strings will always be longer than the resulting
6312 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313 length after conversion to the true value. (But decoding error
6314 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006315 _PyUnicodeWriter_Init(&writer);
6316 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006317
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 end = s + size;
6319 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 unsigned char c;
6321 Py_UCS4 x;
6322 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006323 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 /* Non-escape characters are interpreted as Unicode ordinals */
6326 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006327 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006328 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006329 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006331 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 startinpos = s-starts;
6333
6334 /* \u-escapes are only interpreted iff the number of leading
6335 backslashes if odd */
6336 bs = s;
6337 for (;s < end;) {
6338 if (*s != '\\')
6339 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006340 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006341 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006342 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 }
6344 if (((s - bs) & 1) == 0 ||
6345 s >= end ||
6346 (*s != 'u' && *s != 'U')) {
6347 continue;
6348 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 count = *s=='u' ? 4 : 8;
6351 s++;
6352
6353 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 for (x = 0, i = 0; i < count; ++i, ++s) {
6355 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006356 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006358 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 errors, &errorHandler,
6360 "rawunicodeescape", "truncated \\uXXXX",
6361 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006362 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 goto onError;
6364 goto nextByte;
6365 }
6366 x = (x<<4) & ~0xF;
6367 if (c >= '0' && c <= '9')
6368 x += c - '0';
6369 else if (c >= 'a' && c <= 'f')
6370 x += 10 + c - 'a';
6371 else
6372 x += 10 + c - 'A';
6373 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006374 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006375 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006376 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006377 }
6378 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006379 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006380 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006381 errors, &errorHandler,
6382 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 nextByte:
6388 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 Py_XDECREF(errorHandler);
6391 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006392 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006393
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006395 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(errorHandler);
6397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 return NULL;
6399}
6400
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401
Alexander Belopolsky40018472011-02-26 01:02:56 +00006402PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006406 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407 int kind;
6408 void *data;
6409 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006410 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 if (!PyUnicode_Check(unicode)) {
6413 PyErr_BadArgument();
6414 return NULL;
6415 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006416 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006417 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006418
6419 _PyBytesWriter_Init(&writer);
6420
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 kind = PyUnicode_KIND(unicode);
6422 data = PyUnicode_DATA(unicode);
6423 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006424
Victor Stinner358af132015-10-12 22:36:57 +02006425 p = _PyBytesWriter_Alloc(&writer, len);
6426 if (p == NULL)
6427 goto error;
6428 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006429
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430 for (pos = 0; pos < len; pos++) {
6431 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 /* Map 32-bit characters to '\Uxxxxxxxx' */
6433 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006434 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006435
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006436 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006437 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6438 if (p == NULL)
6439 goto error;
6440
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006441 *p++ = '\\';
6442 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006443 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6444 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6445 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6446 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6447 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6449 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6450 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 else if (ch >= 256) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006454 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006455 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6456 if (p == NULL)
6457 goto error;
6458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 *p++ = '\\';
6460 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006461 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6464 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 /* Copy everything else as-is */
6467 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 *p++ = (char) ch;
6469 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006470
Victor Stinner358af132015-10-12 22:36:57 +02006471 return _PyBytesWriter_Finish(&writer, p);
6472
6473error:
6474 _PyBytesWriter_Dealloc(&writer);
6475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476}
6477
Alexander Belopolsky40018472011-02-26 01:02:56 +00006478PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6480 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006482 PyObject *result;
6483 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6484 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006485 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6487 Py_DECREF(tmp);
6488 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006491/* --- Unicode Internal Codec ------------------------------------------- */
6492
Alexander Belopolsky40018472011-02-26 01:02:56 +00006493PyObject *
6494_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006495 Py_ssize_t size,
6496 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497{
6498 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006499 Py_ssize_t startinpos;
6500 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006501 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006502 const char *end;
6503 const char *reason;
6504 PyObject *errorHandler = NULL;
6505 PyObject *exc = NULL;
6506
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006507 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006508 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006509 1))
6510 return NULL;
6511
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006512 if (size == 0)
6513 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006514
Victor Stinner8f674cc2013-04-17 23:02:17 +02006515 _PyUnicodeWriter_Init(&writer);
6516 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6517 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006519 }
6520 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006521
Victor Stinner8f674cc2013-04-17 23:02:17 +02006522 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006523 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006524 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006525 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006526 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006527 endinpos = end-starts;
6528 reason = "truncated input";
6529 goto error;
6530 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006531 /* We copy the raw representation one byte at a time because the
6532 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006533 ((char *) &uch)[0] = s[0];
6534 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006535#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006536 ((char *) &uch)[2] = s[2];
6537 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006538#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006539 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006540#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006541 /* We have to sanity check the raw data, otherwise doom looms for
6542 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006543 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006544 endinpos = s - starts + Py_UNICODE_SIZE;
6545 reason = "illegal code point (> 0x10FFFF)";
6546 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006547 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006548#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006549 s += Py_UNICODE_SIZE;
6550#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006551 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006552 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 Py_UNICODE uch2;
6554 ((char *) &uch2)[0] = s[0];
6555 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006556 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006557 {
Victor Stinner551ac952011-11-29 22:58:13 +01006558 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006560 }
6561 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562#endif
6563
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006564 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006565 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006566 continue;
6567
6568 error:
6569 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006570 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006571 errors, &errorHandler,
6572 "unicode_internal", reason,
6573 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006574 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006575 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006576 }
6577
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 Py_XDECREF(errorHandler);
6579 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006580 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006581
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006583 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006584 Py_XDECREF(errorHandler);
6585 Py_XDECREF(exc);
6586 return NULL;
6587}
6588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589/* --- Latin-1 Codec ------------------------------------------------------ */
6590
Alexander Belopolsky40018472011-02-26 01:02:56 +00006591PyObject *
6592PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006593 Py_ssize_t size,
6594 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006597 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598}
6599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601static void
6602make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006603 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006604 PyObject *unicode,
6605 Py_ssize_t startpos, Py_ssize_t endpos,
6606 const char *reason)
6607{
6608 if (*exceptionObject == NULL) {
6609 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006611 encoding, unicode, startpos, endpos, reason);
6612 }
6613 else {
6614 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6615 goto onError;
6616 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6617 goto onError;
6618 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6619 goto onError;
6620 return;
6621 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006622 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006623 }
6624}
6625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627static void
6628raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006629 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006630 PyObject *unicode,
6631 Py_ssize_t startpos, Py_ssize_t endpos,
6632 const char *reason)
6633{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006634 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006635 encoding, unicode, startpos, endpos, reason);
6636 if (*exceptionObject != NULL)
6637 PyCodec_StrictErrors(*exceptionObject);
6638}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639
6640/* error handling callback helper:
6641 build arguments, call the callback and check the arguments,
6642 put the result into newpos and return the replacement string, which
6643 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644static PyObject *
6645unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 PyObject **errorHandler,
6647 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006649 Py_ssize_t startpos, Py_ssize_t endpos,
6650 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006652 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 PyObject *restuple;
6655 PyObject *resunicode;
6656
6657 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 }
6662
Benjamin Petersonbac79492012-01-14 13:34:47 -05006663 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 return NULL;
6665 len = PyUnicode_GET_LENGTH(unicode);
6666
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006667 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671
6672 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006677 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 Py_DECREF(restuple);
6679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006681 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 &resunicode, newpos)) {
6683 Py_DECREF(restuple);
6684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006686 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6687 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6688 Py_DECREF(restuple);
6689 return NULL;
6690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006692 *newpos = len + *newpos;
6693 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006694 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 Py_DECREF(restuple);
6696 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698 Py_INCREF(resunicode);
6699 Py_DECREF(restuple);
6700 return resunicode;
6701}
6702
Alexander Belopolsky40018472011-02-26 01:02:56 +00006703static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006705 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006706 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006708 /* input state */
6709 Py_ssize_t pos=0, size;
6710 int kind;
6711 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 /* pointer into the output */
6713 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006714 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6715 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006716 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006718 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006719 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006720 /* output object */
6721 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722
Benjamin Petersonbac79492012-01-14 13:34:47 -05006723 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 return NULL;
6725 size = PyUnicode_GET_LENGTH(unicode);
6726 kind = PyUnicode_KIND(unicode);
6727 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 /* allocate enough for a simple encoding without
6729 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006730 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006731 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006732
6733 _PyBytesWriter_Init(&writer);
6734 str = _PyBytesWriter_Alloc(&writer, size);
6735 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006736 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006739 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006742 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006744 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006745 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006746 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006748 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006751 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006753
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006754 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006756
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006757 /* Only overallocate the buffer if it's not the last write */
6758 writer.overallocate = (collend < size);
6759
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006761 if (error_handler == _Py_ERROR_UNKNOWN)
6762 error_handler = get_error_handler(errors);
6763
6764 switch (error_handler) {
6765 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006766 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006768
6769 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006770 memset(str, '?', collend - collstart);
6771 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006772 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006773 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006774 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 break;
Victor Stinner50149202015-09-22 00:26:54 +02006776
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006777 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006778 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006779 writer.min_size -= (collend - collstart);
6780 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006781 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006782 if (str == NULL)
6783 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006784 pos = collend;
6785 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006786
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006788 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006789 writer.min_size -= (collend - collstart);
6790 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 unicode, collstart, collend);
6792 if (str == NULL)
6793 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 break;
Victor Stinner50149202015-09-22 00:26:54 +02006796
Victor Stinnerc3713e92015-09-29 12:32:13 +02006797 case _Py_ERROR_SURROGATEESCAPE:
6798 for (i = collstart; i < collend; ++i) {
6799 ch = PyUnicode_READ(kind, data, i);
6800 if (ch < 0xdc80 || 0xdcff < ch) {
6801 /* Not a UTF-8b surrogate */
6802 break;
6803 }
6804 *str++ = (char)(ch - 0xdc00);
6805 ++pos;
6806 }
6807 if (i >= collend)
6808 break;
6809 collstart = pos;
6810 assert(collstart != collend);
6811 /* fallback to general error handling */
6812
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006814 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6815 encoding, reason, unicode, &exc,
6816 collstart, collend, &newpos);
6817 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006819
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006820 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006821 writer.min_size -= 1;
6822
Victor Stinner6bd525b2015-10-09 13:10:05 +02006823 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006824 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006825 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006826 PyBytes_AS_STRING(rep),
6827 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006828 if (str == NULL)
6829 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006830 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006831 else {
6832 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006833
Victor Stinner6bd525b2015-10-09 13:10:05 +02006834 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006836
6837 if (PyUnicode_IS_ASCII(rep)) {
6838 /* Fast path: all characters are smaller than limit */
6839 assert(limit >= 128);
6840 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6841 str = _PyBytesWriter_WriteBytes(&writer, str,
6842 PyUnicode_DATA(rep),
6843 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 else {
6846 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6847
6848 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6849 if (str == NULL)
6850 goto onError;
6851
6852 /* check if there is anything unencodable in the
6853 replacement and copy it to the output */
6854 for (i = 0; repsize-->0; ++i, ++str) {
6855 ch = PyUnicode_READ_CHAR(rep, i);
6856 if (ch >= limit) {
6857 raise_encode_exception(&exc, encoding, unicode,
6858 pos, pos+1, reason);
6859 goto onError;
6860 }
6861 *str = (char)ch;
6862 }
6863 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006865 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006866 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006868
6869 /* If overallocation was disabled, ensure that it was the last
6870 write. Otherwise, we missed an optimization */
6871 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872 }
6873 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874
Victor Stinner50149202015-09-22 00:26:54 +02006875 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006877 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006878
6879 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006880 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006881 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006882 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006883 Py_XDECREF(exc);
6884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885}
6886
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006888PyObject *
6889PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006890 Py_ssize_t size,
6891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 PyObject *result;
6894 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6895 if (unicode == NULL)
6896 return NULL;
6897 result = unicode_encode_ucs1(unicode, errors, 256);
6898 Py_DECREF(unicode);
6899 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006903_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904{
6905 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 PyErr_BadArgument();
6907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006909 if (PyUnicode_READY(unicode) == -1)
6910 return NULL;
6911 /* Fast path: if it is a one-byte string, construct
6912 bytes object directly. */
6913 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6914 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6915 PyUnicode_GET_LENGTH(unicode));
6916 /* Non-Latin-1 characters present. Defer to above function to
6917 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006918 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006919}
6920
6921PyObject*
6922PyUnicode_AsLatin1String(PyObject *unicode)
6923{
6924 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
6927/* --- 7-bit ASCII Codec -------------------------------------------------- */
6928
Alexander Belopolsky40018472011-02-26 01:02:56 +00006929PyObject *
6930PyUnicode_DecodeASCII(const char *s,
6931 Py_ssize_t size,
6932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006935 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006936 int kind;
6937 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006938 Py_ssize_t startinpos;
6939 Py_ssize_t endinpos;
6940 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006942 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006944 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006947 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006950 if (size == 1 && (unsigned char)s[0] < 128)
6951 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006952
Victor Stinner8f674cc2013-04-17 23:02:17 +02006953 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006954 writer.min_length = size;
6955 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006956 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 writer.pos = outpos;
6962 if (writer.pos == size)
6963 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006964
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 s += writer.pos;
6966 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006968 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 PyUnicode_WRITE(kind, data, writer.pos, c);
6971 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006975
6976 /* byte outsize range 0x00..0x7f: call the error handler */
6977
6978 if (error_handler == _Py_ERROR_UNKNOWN)
6979 error_handler = get_error_handler(errors);
6980
6981 switch (error_handler)
6982 {
6983 case _Py_ERROR_REPLACE:
6984 case _Py_ERROR_SURROGATEESCAPE:
6985 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006986 but we may switch to UCS2 at the first write */
6987 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6988 goto onError;
6989 kind = writer.kind;
6990 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991
6992 if (error_handler == _Py_ERROR_REPLACE)
6993 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6994 else
6995 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6996 writer.pos++;
6997 ++s;
6998 break;
6999
7000 case _Py_ERROR_IGNORE:
7001 ++s;
7002 break;
7003
7004 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 startinpos = s-starts;
7006 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007008 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 "ascii", "ordinal not in range(128)",
7010 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007011 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 kind = writer.kind;
7014 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007017 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007018 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007022 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 return NULL;
7026}
7027
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029PyObject *
7030PyUnicode_EncodeASCII(const Py_UNICODE *p,
7031 Py_ssize_t size,
7032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 PyObject *result;
7035 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7036 if (unicode == NULL)
7037 return NULL;
7038 result = unicode_encode_ucs1(unicode, errors, 128);
7039 Py_DECREF(unicode);
7040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
Alexander Belopolsky40018472011-02-26 01:02:56 +00007043PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045{
7046 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyErr_BadArgument();
7048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 if (PyUnicode_READY(unicode) == -1)
7051 return NULL;
7052 /* Fast path: if it is an ASCII-only string, construct bytes object
7053 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007054 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7056 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007058}
7059
7060PyObject *
7061PyUnicode_AsASCIIString(PyObject *unicode)
7062{
7063 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Victor Stinner99b95382011-07-04 14:23:54 +02007066#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007067
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007068/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007069
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007070#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071#define NEED_RETRY
7072#endif
7073
Victor Stinner3a50e702011-10-18 21:21:00 +02007074#ifndef WC_ERR_INVALID_CHARS
7075# define WC_ERR_INVALID_CHARS 0x0080
7076#endif
7077
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007078static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007079code_page_name(UINT code_page, PyObject **obj)
7080{
7081 *obj = NULL;
7082 if (code_page == CP_ACP)
7083 return "mbcs";
7084 if (code_page == CP_UTF7)
7085 return "CP_UTF7";
7086 if (code_page == CP_UTF8)
7087 return "CP_UTF8";
7088
7089 *obj = PyBytes_FromFormat("cp%u", code_page);
7090 if (*obj == NULL)
7091 return NULL;
7092 return PyBytes_AS_STRING(*obj);
7093}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094
Victor Stinner3a50e702011-10-18 21:21:00 +02007095static DWORD
7096decode_code_page_flags(UINT code_page)
7097{
7098 if (code_page == CP_UTF7) {
7099 /* The CP_UTF7 decoder only supports flags=0 */
7100 return 0;
7101 }
7102 else
7103 return MB_ERR_INVALID_CHARS;
7104}
7105
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 * Decode a byte string from a Windows code page into unicode object in strict
7108 * mode.
7109 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007110 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7111 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007114decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 const char *in,
7117 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118{
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007120 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
7123 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 assert(insize > 0);
7125 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7126 if (outsize <= 0)
7127 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128
7129 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007131 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007132 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 if (*v == NULL)
7134 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 }
7137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007140 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 }
7144
7145 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7147 if (outsize <= 0)
7148 goto error;
7149 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007150
Victor Stinner3a50e702011-10-18 21:21:00 +02007151error:
7152 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7153 return -2;
7154 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007155 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156}
7157
Victor Stinner3a50e702011-10-18 21:21:00 +02007158/*
7159 * Decode a byte string from a code page into unicode object with an error
7160 * handler.
7161 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007162 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 * UnicodeDecodeError exception and returns -1 on error.
7164 */
7165static int
7166decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007167 PyObject **v,
7168 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007169 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007170{
7171 const char *startin = in;
7172 const char *endin = in + size;
7173 const DWORD flags = decode_code_page_flags(code_page);
7174 /* Ideally, we should get reason from FormatMessage. This is the Windows
7175 2000 English version of the message. */
7176 const char *reason = "No mapping for the Unicode character exists "
7177 "in the target code page.";
7178 /* each step cannot decode more than 1 character, but a character can be
7179 represented as a surrogate pair */
7180 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007181 int insize;
7182 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 PyObject *errorHandler = NULL;
7184 PyObject *exc = NULL;
7185 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007186 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 DWORD err;
7188 int ret = -1;
7189
7190 assert(size > 0);
7191
7192 encoding = code_page_name(code_page, &encoding_obj);
7193 if (encoding == NULL)
7194 return -1;
7195
Victor Stinner7d00cc12014-03-17 23:08:06 +01007196 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7198 UnicodeDecodeError. */
7199 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7200 if (exc != NULL) {
7201 PyCodec_StrictErrors(exc);
7202 Py_CLEAR(exc);
7203 }
7204 goto error;
7205 }
7206
7207 if (*v == NULL) {
7208 /* Create unicode object */
7209 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7210 PyErr_NoMemory();
7211 goto error;
7212 }
Victor Stinnerab595942011-12-17 04:59:06 +01007213 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 if (*v == NULL)
7216 goto error;
7217 startout = PyUnicode_AS_UNICODE(*v);
7218 }
7219 else {
7220 /* Extend unicode object */
7221 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7222 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7223 PyErr_NoMemory();
7224 goto error;
7225 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007226 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 goto error;
7228 startout = PyUnicode_AS_UNICODE(*v) + n;
7229 }
7230
7231 /* Decode the byte string character per character */
7232 out = startout;
7233 while (in < endin)
7234 {
7235 /* Decode a character */
7236 insize = 1;
7237 do
7238 {
7239 outsize = MultiByteToWideChar(code_page, flags,
7240 in, insize,
7241 buffer, Py_ARRAY_LENGTH(buffer));
7242 if (outsize > 0)
7243 break;
7244 err = GetLastError();
7245 if (err != ERROR_NO_UNICODE_TRANSLATION
7246 && err != ERROR_INSUFFICIENT_BUFFER)
7247 {
7248 PyErr_SetFromWindowsErr(0);
7249 goto error;
7250 }
7251 insize++;
7252 }
7253 /* 4=maximum length of a UTF-8 sequence */
7254 while (insize <= 4 && (in + insize) <= endin);
7255
7256 if (outsize <= 0) {
7257 Py_ssize_t startinpos, endinpos, outpos;
7258
Victor Stinner7d00cc12014-03-17 23:08:06 +01007259 /* last character in partial decode? */
7260 if (in + insize >= endin && !final)
7261 break;
7262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 startinpos = in - startin;
7264 endinpos = startinpos + 1;
7265 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007266 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 errors, &errorHandler,
7268 encoding, reason,
7269 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 {
7272 goto error;
7273 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007274 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 }
7276 else {
7277 in += insize;
7278 memcpy(out, buffer, outsize * sizeof(wchar_t));
7279 out += outsize;
7280 }
7281 }
7282
7283 /* write a NUL character at the end */
7284 *out = 0;
7285
7286 /* Extend unicode object */
7287 outsize = out - startout;
7288 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007289 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007291 /* (in - startin) <= size and size is an int */
7292 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007293
7294error:
7295 Py_XDECREF(encoding_obj);
7296 Py_XDECREF(errorHandler);
7297 Py_XDECREF(exc);
7298 return ret;
7299}
7300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301static PyObject *
7302decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 const char *s, Py_ssize_t size,
7304 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305{
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 PyObject *v = NULL;
7307 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 if (code_page < 0) {
7310 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7311 return NULL;
7312 }
7313
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 do
7318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 if (size > INT_MAX) {
7321 chunk_size = INT_MAX;
7322 final = 0;
7323 done = 0;
7324 }
7325 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 {
7328 chunk_size = (int)size;
7329 final = (consumed == NULL);
7330 done = 1;
7331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 if (chunk_size == 0 && done) {
7334 if (v != NULL)
7335 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007336 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 converted = decode_code_page_strict(code_page, &v,
7340 s, chunk_size);
7341 if (converted == -2)
7342 converted = decode_code_page_errors(code_page, &v,
7343 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007344 errors, final);
7345 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007346
7347 if (converted < 0) {
7348 Py_XDECREF(v);
7349 return NULL;
7350 }
7351
7352 if (consumed)
7353 *consumed += converted;
7354
7355 s += converted;
7356 size -= converted;
7357 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007358
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007359 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360}
7361
Alexander Belopolsky40018472011-02-26 01:02:56 +00007362PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007363PyUnicode_DecodeCodePageStateful(int code_page,
7364 const char *s,
7365 Py_ssize_t size,
7366 const char *errors,
7367 Py_ssize_t *consumed)
7368{
7369 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7370}
7371
7372PyObject *
7373PyUnicode_DecodeMBCSStateful(const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7379}
7380
7381PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007382PyUnicode_DecodeMBCS(const char *s,
7383 Py_ssize_t size,
7384 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007385{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7387}
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389static DWORD
7390encode_code_page_flags(UINT code_page, const char *errors)
7391{
7392 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007393 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 }
7395 else if (code_page == CP_UTF7) {
7396 /* CP_UTF7 only supports flags=0 */
7397 return 0;
7398 }
7399 else {
7400 if (errors != NULL && strcmp(errors, "replace") == 0)
7401 return 0;
7402 else
7403 return WC_NO_BEST_FIT_CHARS;
7404 }
7405}
7406
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007407/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 * Encode a Unicode string to a Windows code page into a byte string in strict
7409 * mode.
7410 *
7411 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007412 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007414static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007415encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007416 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418{
Victor Stinner554f3f02010-06-16 23:33:54 +00007419 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 BOOL *pusedDefaultChar = &usedDefaultChar;
7421 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007422 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const DWORD flags = encode_code_page_flags(code_page, NULL);
7425 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 /* Create a substring so that we can get the UTF-16 representation
7427 of just the slice under consideration. */
7428 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007433 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007435 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007436
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 substring = PyUnicode_Substring(unicode, offset, offset+len);
7438 if (substring == NULL)
7439 return -1;
7440 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7441 if (p == NULL) {
7442 Py_DECREF(substring);
7443 return -1;
7444 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007445 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007447 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 NULL, 0,
7451 NULL, pusedDefaultChar);
7452 if (outsize <= 0)
7453 goto error;
7454 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 if (pusedDefaultChar && *pusedDefaultChar) {
7456 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 if (*outbytes == NULL) {
7464 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468 }
7469 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 const Py_ssize_t n = PyBytes_Size(*outbytes);
7472 if (outsize > PY_SSIZE_T_MAX - n) {
7473 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7478 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482 }
7483
7484 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007486 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 out, outsize,
7488 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 if (outsize <= 0)
7491 goto error;
7492 if (pusedDefaultChar && *pusedDefaultChar)
7493 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7499 return -2;
7500 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007501 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007502}
7503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007505 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 * error handler.
7507 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007508 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 * -1 on other error.
7510 */
7511static int
7512encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007513 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007515{
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 Py_ssize_t pos = unicode_offset;
7518 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 /* Ideally, we should get reason from FormatMessage. This is the Windows
7520 2000 English version of the message. */
7521 const char *reason = "invalid character";
7522 /* 4=maximum length of a UTF-8 sequence */
7523 char buffer[4];
7524 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7525 Py_ssize_t outsize;
7526 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 PyObject *errorHandler = NULL;
7528 PyObject *exc = NULL;
7529 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007530 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *rep;
7533 int ret = -1;
7534
7535 assert(insize > 0);
7536
7537 encoding = code_page_name(code_page, &encoding_obj);
7538 if (encoding == NULL)
7539 return -1;
7540
7541 if (errors == NULL || strcmp(errors, "strict") == 0) {
7542 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7543 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007544 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 if (exc != NULL) {
7546 PyCodec_StrictErrors(exc);
7547 Py_DECREF(exc);
7548 }
7549 Py_XDECREF(encoding_obj);
7550 return -1;
7551 }
7552
7553 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7554 pusedDefaultChar = &usedDefaultChar;
7555 else
7556 pusedDefaultChar = NULL;
7557
7558 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7559 PyErr_NoMemory();
7560 goto error;
7561 }
7562 outsize = insize * Py_ARRAY_LENGTH(buffer);
7563
7564 if (*outbytes == NULL) {
7565 /* Create string object */
7566 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7567 if (*outbytes == NULL)
7568 goto error;
7569 out = PyBytes_AS_STRING(*outbytes);
7570 }
7571 else {
7572 /* Extend string object */
7573 Py_ssize_t n = PyBytes_Size(*outbytes);
7574 if (n > PY_SSIZE_T_MAX - outsize) {
7575 PyErr_NoMemory();
7576 goto error;
7577 }
7578 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7579 goto error;
7580 out = PyBytes_AS_STRING(*outbytes) + n;
7581 }
7582
7583 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007584 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7587 wchar_t chars[2];
7588 int charsize;
7589 if (ch < 0x10000) {
7590 chars[0] = (wchar_t)ch;
7591 charsize = 1;
7592 }
7593 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007594 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7595 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007596 charsize = 2;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 buffer, Py_ARRAY_LENGTH(buffer),
7602 NULL, pusedDefaultChar);
7603 if (outsize > 0) {
7604 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7605 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 memcpy(out, buffer, outsize);
7608 out += outsize;
7609 continue;
7610 }
7611 }
7612 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7613 PyErr_SetFromWindowsErr(0);
7614 goto error;
7615 }
7616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 rep = unicode_encode_call_errorhandler(
7618 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007619 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 if (rep == NULL)
7622 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007624
7625 if (PyBytes_Check(rep)) {
7626 outsize = PyBytes_GET_SIZE(rep);
7627 if (outsize != 1) {
7628 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7629 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7630 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7631 Py_DECREF(rep);
7632 goto error;
7633 }
7634 out = PyBytes_AS_STRING(*outbytes) + offset;
7635 }
7636 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7637 out += outsize;
7638 }
7639 else {
7640 Py_ssize_t i;
7641 enum PyUnicode_Kind kind;
7642 void *data;
7643
Benjamin Petersonbac79492012-01-14 13:34:47 -05007644 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 Py_DECREF(rep);
7646 goto error;
7647 }
7648
7649 outsize = PyUnicode_GET_LENGTH(rep);
7650 if (outsize != 1) {
7651 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7652 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7653 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7654 Py_DECREF(rep);
7655 goto error;
7656 }
7657 out = PyBytes_AS_STRING(*outbytes) + offset;
7658 }
7659 kind = PyUnicode_KIND(rep);
7660 data = PyUnicode_DATA(rep);
7661 for (i=0; i < outsize; i++) {
7662 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7663 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007664 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 encoding, unicode,
7666 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 "unable to encode error handler result to ASCII");
7668 Py_DECREF(rep);
7669 goto error;
7670 }
7671 *out = (unsigned char)ch;
7672 out++;
7673 }
7674 }
7675 Py_DECREF(rep);
7676 }
7677 /* write a NUL byte */
7678 *out = 0;
7679 outsize = out - PyBytes_AS_STRING(*outbytes);
7680 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7681 if (_PyBytes_Resize(outbytes, outsize) < 0)
7682 goto error;
7683 ret = 0;
7684
7685error:
7686 Py_XDECREF(encoding_obj);
7687 Py_XDECREF(errorHandler);
7688 Py_XDECREF(exc);
7689 return ret;
7690}
7691
Victor Stinner3a50e702011-10-18 21:21:00 +02007692static PyObject *
7693encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007694 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 const char *errors)
7696{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007697 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007701
Victor Stinner29dacf22015-01-26 16:41:32 +01007702 if (!PyUnicode_Check(unicode)) {
7703 PyErr_BadArgument();
7704 return NULL;
7705 }
7706
Benjamin Petersonbac79492012-01-14 13:34:47 -05007707 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007708 return NULL;
7709 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 if (code_page < 0) {
7712 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7713 return NULL;
7714 }
7715
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 return PyBytes_FromStringAndSize(NULL, 0);
7718
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 offset = 0;
7720 do
7721 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007722#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007723 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunks. */
7725 if (len > INT_MAX/2) {
7726 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 done = 0;
7728 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007730#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 done = 1;
7734 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007735
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 errors);
7739 if (ret == -2)
7740 ret = encode_code_page_errors(code_page, &outbytes,
7741 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 if (ret < 0) {
7744 Py_XDECREF(outbytes);
7745 return NULL;
7746 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007747
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007749 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007750 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 return outbytes;
7753}
7754
7755PyObject *
7756PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7757 Py_ssize_t size,
7758 const char *errors)
7759{
Victor Stinner7581cef2011-11-03 22:32:33 +01007760 PyObject *unicode, *res;
7761 unicode = PyUnicode_FromUnicode(p, size);
7762 if (unicode == NULL)
7763 return NULL;
7764 res = encode_code_page(CP_ACP, unicode, errors);
7765 Py_DECREF(unicode);
7766 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767}
7768
7769PyObject *
7770PyUnicode_EncodeCodePage(int code_page,
7771 PyObject *unicode,
7772 const char *errors)
7773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007775}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007776
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777PyObject *
7778PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007779{
Victor Stinner7581cef2011-11-03 22:32:33 +01007780 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007781}
7782
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007783#undef NEED_RETRY
7784
Victor Stinner99b95382011-07-04 14:23:54 +02007785#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787/* --- Character Mapping Codec -------------------------------------------- */
7788
Victor Stinnerfb161b12013-04-18 01:44:27 +02007789static int
7790charmap_decode_string(const char *s,
7791 Py_ssize_t size,
7792 PyObject *mapping,
7793 const char *errors,
7794 _PyUnicodeWriter *writer)
7795{
7796 const char *starts = s;
7797 const char *e;
7798 Py_ssize_t startinpos, endinpos;
7799 PyObject *errorHandler = NULL, *exc = NULL;
7800 Py_ssize_t maplen;
7801 enum PyUnicode_Kind mapkind;
7802 void *mapdata;
7803 Py_UCS4 x;
7804 unsigned char ch;
7805
7806 if (PyUnicode_READY(mapping) == -1)
7807 return -1;
7808
7809 maplen = PyUnicode_GET_LENGTH(mapping);
7810 mapdata = PyUnicode_DATA(mapping);
7811 mapkind = PyUnicode_KIND(mapping);
7812
7813 e = s + size;
7814
7815 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7816 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7817 * is disabled in encoding aliases, latin1 is preferred because
7818 * its implementation is faster. */
7819 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7820 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7821 Py_UCS4 maxchar = writer->maxchar;
7822
7823 assert (writer->kind == PyUnicode_1BYTE_KIND);
7824 while (s < e) {
7825 ch = *s;
7826 x = mapdata_ucs1[ch];
7827 if (x > maxchar) {
7828 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7829 goto onError;
7830 maxchar = writer->maxchar;
7831 outdata = (Py_UCS1 *)writer->data;
7832 }
7833 outdata[writer->pos] = x;
7834 writer->pos++;
7835 ++s;
7836 }
7837 return 0;
7838 }
7839
7840 while (s < e) {
7841 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7842 enum PyUnicode_Kind outkind = writer->kind;
7843 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7844 if (outkind == PyUnicode_1BYTE_KIND) {
7845 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7846 Py_UCS4 maxchar = writer->maxchar;
7847 while (s < e) {
7848 ch = *s;
7849 x = mapdata_ucs2[ch];
7850 if (x > maxchar)
7851 goto Error;
7852 outdata[writer->pos] = x;
7853 writer->pos++;
7854 ++s;
7855 }
7856 break;
7857 }
7858 else if (outkind == PyUnicode_2BYTE_KIND) {
7859 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7860 while (s < e) {
7861 ch = *s;
7862 x = mapdata_ucs2[ch];
7863 if (x == 0xFFFE)
7864 goto Error;
7865 outdata[writer->pos] = x;
7866 writer->pos++;
7867 ++s;
7868 }
7869 break;
7870 }
7871 }
7872 ch = *s;
7873
7874 if (ch < maplen)
7875 x = PyUnicode_READ(mapkind, mapdata, ch);
7876 else
7877 x = 0xfffe; /* invalid value */
7878Error:
7879 if (x == 0xfffe)
7880 {
7881 /* undefined mapping */
7882 startinpos = s-starts;
7883 endinpos = startinpos+1;
7884 if (unicode_decode_call_errorhandler_writer(
7885 errors, &errorHandler,
7886 "charmap", "character maps to <undefined>",
7887 &starts, &e, &startinpos, &endinpos, &exc, &s,
7888 writer)) {
7889 goto onError;
7890 }
7891 continue;
7892 }
7893
7894 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7895 goto onError;
7896 ++s;
7897 }
7898 Py_XDECREF(errorHandler);
7899 Py_XDECREF(exc);
7900 return 0;
7901
7902onError:
7903 Py_XDECREF(errorHandler);
7904 Py_XDECREF(exc);
7905 return -1;
7906}
7907
7908static int
7909charmap_decode_mapping(const char *s,
7910 Py_ssize_t size,
7911 PyObject *mapping,
7912 const char *errors,
7913 _PyUnicodeWriter *writer)
7914{
7915 const char *starts = s;
7916 const char *e;
7917 Py_ssize_t startinpos, endinpos;
7918 PyObject *errorHandler = NULL, *exc = NULL;
7919 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007920 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007921
7922 e = s + size;
7923
7924 while (s < e) {
7925 ch = *s;
7926
7927 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7928 key = PyLong_FromLong((long)ch);
7929 if (key == NULL)
7930 goto onError;
7931
7932 item = PyObject_GetItem(mapping, key);
7933 Py_DECREF(key);
7934 if (item == NULL) {
7935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7936 /* No mapping found means: mapping is undefined. */
7937 PyErr_Clear();
7938 goto Undefined;
7939 } else
7940 goto onError;
7941 }
7942
7943 /* Apply mapping */
7944 if (item == Py_None)
7945 goto Undefined;
7946 if (PyLong_Check(item)) {
7947 long value = PyLong_AS_LONG(item);
7948 if (value == 0xFFFE)
7949 goto Undefined;
7950 if (value < 0 || value > MAX_UNICODE) {
7951 PyErr_Format(PyExc_TypeError,
7952 "character mapping must be in range(0x%lx)",
7953 (unsigned long)MAX_UNICODE + 1);
7954 goto onError;
7955 }
7956
7957 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7958 goto onError;
7959 }
7960 else if (PyUnicode_Check(item)) {
7961 if (PyUnicode_READY(item) == -1)
7962 goto onError;
7963 if (PyUnicode_GET_LENGTH(item) == 1) {
7964 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7965 if (value == 0xFFFE)
7966 goto Undefined;
7967 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7968 goto onError;
7969 }
7970 else {
7971 writer->overallocate = 1;
7972 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7973 goto onError;
7974 }
7975 }
7976 else {
7977 /* wrong return value */
7978 PyErr_SetString(PyExc_TypeError,
7979 "character mapping must return integer, None or str");
7980 goto onError;
7981 }
7982 Py_CLEAR(item);
7983 ++s;
7984 continue;
7985
7986Undefined:
7987 /* undefined mapping */
7988 Py_CLEAR(item);
7989 startinpos = s-starts;
7990 endinpos = startinpos+1;
7991 if (unicode_decode_call_errorhandler_writer(
7992 errors, &errorHandler,
7993 "charmap", "character maps to <undefined>",
7994 &starts, &e, &startinpos, &endinpos, &exc, &s,
7995 writer)) {
7996 goto onError;
7997 }
7998 }
7999 Py_XDECREF(errorHandler);
8000 Py_XDECREF(exc);
8001 return 0;
8002
8003onError:
8004 Py_XDECREF(item);
8005 Py_XDECREF(errorHandler);
8006 Py_XDECREF(exc);
8007 return -1;
8008}
8009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010PyObject *
8011PyUnicode_DecodeCharmap(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008016 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008017
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 /* Default to Latin-1 */
8019 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008023 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008024 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008025 writer.min_length = size;
8026 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008028
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008029 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008030 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8031 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008032 }
8033 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008038
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008040 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 return NULL;
8042}
8043
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044/* Charmap encoding: the lookup table */
8045
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 PyObject_HEAD
8048 unsigned char level1[32];
8049 int count2, count3;
8050 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051};
8052
8053static PyObject*
8054encoding_map_size(PyObject *obj, PyObject* args)
8055{
8056 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059}
8060
8061static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 PyDoc_STR("Return the size (in bytes) of this object") },
8064 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065};
8066
8067static void
8068encoding_map_dealloc(PyObject* o)
8069{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008070 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071}
8072
8073static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "EncodingMap", /*tp_name*/
8076 sizeof(struct encoding_map), /*tp_basicsize*/
8077 0, /*tp_itemsize*/
8078 /* methods */
8079 encoding_map_dealloc, /*tp_dealloc*/
8080 0, /*tp_print*/
8081 0, /*tp_getattr*/
8082 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008083 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 0, /*tp_repr*/
8085 0, /*tp_as_number*/
8086 0, /*tp_as_sequence*/
8087 0, /*tp_as_mapping*/
8088 0, /*tp_hash*/
8089 0, /*tp_call*/
8090 0, /*tp_str*/
8091 0, /*tp_getattro*/
8092 0, /*tp_setattro*/
8093 0, /*tp_as_buffer*/
8094 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8095 0, /*tp_doc*/
8096 0, /*tp_traverse*/
8097 0, /*tp_clear*/
8098 0, /*tp_richcompare*/
8099 0, /*tp_weaklistoffset*/
8100 0, /*tp_iter*/
8101 0, /*tp_iternext*/
8102 encoding_map_methods, /*tp_methods*/
8103 0, /*tp_members*/
8104 0, /*tp_getset*/
8105 0, /*tp_base*/
8106 0, /*tp_dict*/
8107 0, /*tp_descr_get*/
8108 0, /*tp_descr_set*/
8109 0, /*tp_dictoffset*/
8110 0, /*tp_init*/
8111 0, /*tp_alloc*/
8112 0, /*tp_new*/
8113 0, /*tp_free*/
8114 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115};
8116
8117PyObject*
8118PyUnicode_BuildEncodingMap(PyObject* string)
8119{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyObject *result;
8121 struct encoding_map *mresult;
8122 int i;
8123 int need_dict = 0;
8124 unsigned char level1[32];
8125 unsigned char level2[512];
8126 unsigned char *mlevel1, *mlevel2, *mlevel3;
8127 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 int kind;
8129 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008130 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008133 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 PyErr_BadArgument();
8135 return NULL;
8136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 kind = PyUnicode_KIND(string);
8138 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 length = PyUnicode_GET_LENGTH(string);
8140 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 memset(level1, 0xFF, sizeof level1);
8142 memset(level2, 0xFF, sizeof level2);
8143
8144 /* If there isn't a one-to-one mapping of NULL to \0,
8145 or if there are non-BMP characters, we need to use
8146 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008149 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 ch = PyUnicode_READ(kind, data, i);
8152 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 need_dict = 1;
8154 break;
8155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 /* unmapped character */
8158 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 l1 = ch >> 11;
8160 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (level1[l1] == 0xFF)
8162 level1[l1] = count2++;
8163 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 }
8166
8167 if (count2 >= 0xFF || count3 >= 0xFF)
8168 need_dict = 1;
8169
8170 if (need_dict) {
8171 PyObject *result = PyDict_New();
8172 PyObject *key, *value;
8173 if (!result)
8174 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008175 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008177 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 if (!key || !value)
8179 goto failed1;
8180 if (PyDict_SetItem(result, key, value) == -1)
8181 goto failed1;
8182 Py_DECREF(key);
8183 Py_DECREF(value);
8184 }
8185 return result;
8186 failed1:
8187 Py_XDECREF(key);
8188 Py_XDECREF(value);
8189 Py_DECREF(result);
8190 return NULL;
8191 }
8192
8193 /* Create a three-level trie */
8194 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8195 16*count2 + 128*count3 - 1);
8196 if (!result)
8197 return PyErr_NoMemory();
8198 PyObject_Init(result, &EncodingMapType);
8199 mresult = (struct encoding_map*)result;
8200 mresult->count2 = count2;
8201 mresult->count3 = count3;
8202 mlevel1 = mresult->level1;
8203 mlevel2 = mresult->level23;
8204 mlevel3 = mresult->level23 + 16*count2;
8205 memcpy(mlevel1, level1, 32);
8206 memset(mlevel2, 0xFF, 16*count2);
8207 memset(mlevel3, 0, 128*count3);
8208 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008209 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008211 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8212 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 /* unmapped character */
8214 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 o1 = ch>>11;
8216 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 i2 = 16*mlevel1[o1] + o2;
8218 if (mlevel2[i2] == 0xFF)
8219 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i3 = 128*mlevel2[i2] + o3;
8222 mlevel3[i3] = i;
8223 }
8224 return result;
8225}
8226
8227static int
Victor Stinner22168992011-11-20 17:09:18 +01008228encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229{
8230 struct encoding_map *map = (struct encoding_map*)mapping;
8231 int l1 = c>>11;
8232 int l2 = (c>>7) & 0xF;
8233 int l3 = c & 0x7F;
8234 int i;
8235
Victor Stinner22168992011-11-20 17:09:18 +01008236 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 if (c == 0)
8239 return 0;
8240 /* level 1*/
8241 i = map->level1[l1];
8242 if (i == 0xFF) {
8243 return -1;
8244 }
8245 /* level 2*/
8246 i = map->level23[16*i+l2];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 3 */
8251 i = map->level23[16*map->count2 + 128*i + l3];
8252 if (i == 0) {
8253 return -1;
8254 }
8255 return i;
8256}
8257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258/* Lookup the character ch in the mapping. If the character
8259 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008260 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008261static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008262charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263{
Christian Heimes217cfd12007-12-02 14:31:20 +00008264 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 PyObject *x;
8266
8267 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 x = PyObject_GetItem(mapping, w);
8270 Py_DECREF(w);
8271 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8273 /* No mapping found means: mapping is undefined. */
8274 PyErr_Clear();
8275 x = Py_None;
8276 Py_INCREF(x);
8277 return x;
8278 } else
8279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008281 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008283 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 long value = PyLong_AS_LONG(x);
8285 if (value < 0 || value > 255) {
8286 PyErr_SetString(PyExc_TypeError,
8287 "character mapping must be in range(256)");
8288 Py_DECREF(x);
8289 return NULL;
8290 }
8291 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008293 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 /* wrong return value */
8297 PyErr_Format(PyExc_TypeError,
8298 "character mapping must return integer, bytes or None, not %.400s",
8299 x->ob_type->tp_name);
8300 Py_DECREF(x);
8301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 }
8303}
8304
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008306charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008308 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8309 /* exponentially overallocate to minimize reallocations */
8310 if (requiredsize < 2*outsize)
8311 requiredsize = 2*outsize;
8312 if (_PyBytes_Resize(outobj, requiredsize))
8313 return -1;
8314 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315}
8316
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008321 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 space is available. Return a new reference to the object that
8323 was put in the output buffer, or Py_None, if the mapping was undefined
8324 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008325 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008327charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 PyObject *rep;
8331 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008332 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333
Christian Heimes90aa7642007-12-19 02:45:37 +00008334 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 if (res == -1)
8338 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 if (outsize<requiredsize)
8340 if (charmapencode_resize(outobj, outpos, requiredsize))
8341 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008342 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 outstart[(*outpos)++] = (char)res;
8344 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 }
8346
8347 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 Py_DECREF(rep);
8352 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 if (PyLong_Check(rep)) {
8355 Py_ssize_t requiredsize = *outpos+1;
8356 if (outsize<requiredsize)
8357 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8358 Py_DECREF(rep);
8359 return enc_EXCEPTION;
8360 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008361 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 else {
8365 const char *repchars = PyBytes_AS_STRING(rep);
8366 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8367 Py_ssize_t requiredsize = *outpos+repsize;
8368 if (outsize<requiredsize)
8369 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8370 Py_DECREF(rep);
8371 return enc_EXCEPTION;
8372 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008373 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 memcpy(outstart + *outpos, repchars, repsize);
8375 *outpos += repsize;
8376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 Py_DECREF(rep);
8379 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380}
8381
8382/* handle an error in PyUnicode_EncodeCharmap
8383 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008384static int
8385charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008388 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008389 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390{
8391 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008392 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008393 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008394 enum PyUnicode_Kind kind;
8395 void *data;
8396 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t collstartpos = *inpos;
8399 Py_ssize_t collendpos = *inpos+1;
8400 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 char *encoding = "charmap";
8402 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008403 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008405 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406
Benjamin Petersonbac79492012-01-14 13:34:47 -05008407 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008408 return -1;
8409 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 /* find all unencodable characters */
8411 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008413 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008414 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008415 val = encoding_map_lookup(ch, mapping);
8416 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 break;
8418 ++collendpos;
8419 continue;
8420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008422 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8423 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 if (rep==NULL)
8425 return -1;
8426 else if (rep!=Py_None) {
8427 Py_DECREF(rep);
8428 break;
8429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
8433 /* cache callback name lookup
8434 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008435 if (*error_handler == _Py_ERROR_UNKNOWN)
8436 *error_handler = get_error_handler(errors);
8437
8438 switch (*error_handler) {
8439 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008440 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008442
8443 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 x = charmapencode_output('?', mapping, res, respos);
8446 if (x==enc_EXCEPTION) {
8447 return -1;
8448 }
8449 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008450 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return -1;
8452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 }
8454 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008455 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456 *inpos = collendpos;
8457 break;
Victor Stinner50149202015-09-22 00:26:54 +02008458
8459 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 /* generate replacement (temporarily (mis)uses p) */
8461 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 char buffer[2+29+1+1];
8463 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008464 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 for (cp = buffer; *cp; ++cp) {
8466 x = charmapencode_output(*cp, mapping, res, respos);
8467 if (x==enc_EXCEPTION)
8468 return -1;
8469 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008470 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 return -1;
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 }
8474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 *inpos = collendpos;
8476 break;
Victor Stinner50149202015-09-22 00:26:54 +02008477
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 default:
Victor Stinner50149202015-09-22 00:26:54 +02008479 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008480 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008484 if (PyBytes_Check(repunicode)) {
8485 /* Directly copy bytes result to output. */
8486 Py_ssize_t outsize = PyBytes_Size(*res);
8487 Py_ssize_t requiredsize;
8488 repsize = PyBytes_Size(repunicode);
8489 requiredsize = *respos + repsize;
8490 if (requiredsize > outsize)
8491 /* Make room for all additional bytes. */
8492 if (charmapencode_resize(res, respos, requiredsize)) {
8493 Py_DECREF(repunicode);
8494 return -1;
8495 }
8496 memcpy(PyBytes_AsString(*res) + *respos,
8497 PyBytes_AsString(repunicode), repsize);
8498 *respos += repsize;
8499 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008500 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008501 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008503 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008504 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008505 Py_DECREF(repunicode);
8506 return -1;
8507 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008508 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008509 data = PyUnicode_DATA(repunicode);
8510 kind = PyUnicode_KIND(repunicode);
8511 for (index = 0; index < repsize; index++) {
8512 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8513 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008515 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
8517 }
8518 else if (x==enc_FAILED) {
8519 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008520 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return -1;
8522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008523 }
8524 *inpos = newpos;
8525 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 }
8527 return 0;
8528}
8529
Alexander Belopolsky40018472011-02-26 01:02:56 +00008530PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008531_PyUnicode_EncodeCharmap(PyObject *unicode,
8532 PyObject *mapping,
8533 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 /* output object */
8536 PyObject *res = NULL;
8537 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008542 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008544 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008545 void *data;
8546 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547
Benjamin Petersonbac79492012-01-14 13:34:47 -05008548 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549 return NULL;
8550 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008551 data = PyUnicode_DATA(unicode);
8552 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 /* Default to Latin-1 */
8555 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 /* allocate enough for a simple encoding without
8559 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008560 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 if (res == NULL)
8562 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008563 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008567 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 if (x==enc_EXCEPTION) /* error */
8571 goto onError;
8572 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008575 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 &res, &respos)) {
8577 goto onError;
8578 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008579 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 else
8581 /* done with this character => adjust input position */
8582 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008586 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008587 if (_PyBytes_Resize(&res, respos) < 0)
8588 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008591 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 return res;
8593
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 Py_XDECREF(res);
8596 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008597 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 return NULL;
8599}
8600
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601/* Deprecated */
8602PyObject *
8603PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8604 Py_ssize_t size,
8605 PyObject *mapping,
8606 const char *errors)
8607{
8608 PyObject *result;
8609 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8610 if (unicode == NULL)
8611 return NULL;
8612 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8613 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008614 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008615}
8616
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617PyObject *
8618PyUnicode_AsCharmapString(PyObject *unicode,
8619 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620{
8621 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 PyErr_BadArgument();
8623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008625 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626}
8627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629static void
8630make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008632 Py_ssize_t startpos, Py_ssize_t endpos,
8633 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 *exceptionObject = _PyUnicodeTranslateError_Create(
8637 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 }
8639 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8641 goto onError;
8642 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8643 goto onError;
8644 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8645 goto onError;
8646 return;
8647 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008648 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 }
8650}
8651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652/* error handling callback helper:
8653 build arguments, call the callback and check the arguments,
8654 put the result into newpos and return the replacement string, which
8655 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008656static PyObject *
8657unicode_translate_call_errorhandler(const char *errors,
8658 PyObject **errorHandler,
8659 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008661 Py_ssize_t startpos, Py_ssize_t endpos,
8662 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008664 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008666 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 PyObject *restuple;
8668 PyObject *resunicode;
8669
8670 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 }
8675
8676 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680
8681 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008686 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 Py_DECREF(restuple);
8688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 }
8690 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 &resunicode, &i_newpos)) {
8692 Py_DECREF(restuple);
8693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 else
8698 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008700 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 Py_DECREF(restuple);
8702 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704 Py_INCREF(resunicode);
8705 Py_DECREF(restuple);
8706 return resunicode;
8707}
8708
8709/* Lookup the character ch in the mapping and put the result in result,
8710 which must be decrefed by the caller.
8711 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008712static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714{
Christian Heimes217cfd12007-12-02 14:31:20 +00008715 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 PyObject *x;
8717
8718 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720 x = PyObject_GetItem(mapping, w);
8721 Py_DECREF(w);
8722 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8724 /* No mapping found means: use 1:1 mapping. */
8725 PyErr_Clear();
8726 *result = NULL;
8727 return 0;
8728 } else
8729 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 }
8731 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 *result = x;
8733 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008735 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008737 if (value < 0 || value > MAX_UNICODE) {
8738 PyErr_Format(PyExc_ValueError,
8739 "character mapping must be in range(0x%x)",
8740 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 Py_DECREF(x);
8742 return -1;
8743 }
8744 *result = x;
8745 return 0;
8746 }
8747 else if (PyUnicode_Check(x)) {
8748 *result = x;
8749 return 0;
8750 }
8751 else {
8752 /* wrong return value */
8753 PyErr_SetString(PyExc_TypeError,
8754 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008755 Py_DECREF(x);
8756 return -1;
8757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758}
Victor Stinner1194ea02014-04-04 19:37:40 +02008759
8760/* lookup the character, write the result into the writer.
8761 Return 1 if the result was written into the writer, return 0 if the mapping
8762 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008764charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8765 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766{
Victor Stinner1194ea02014-04-04 19:37:40 +02008767 PyObject *item;
8768
8769 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008771
8772 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008777 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008778 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008779
8780 if (item == Py_None) {
8781 Py_DECREF(item);
8782 return 0;
8783 }
8784
8785 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008786 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8787 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8788 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008789 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8790 Py_DECREF(item);
8791 return -1;
8792 }
8793 Py_DECREF(item);
8794 return 1;
8795 }
8796
8797 if (!PyUnicode_Check(item)) {
8798 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008800 }
8801
8802 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8803 Py_DECREF(item);
8804 return -1;
8805 }
8806
8807 Py_DECREF(item);
8808 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809}
8810
Victor Stinner89a76ab2014-04-05 11:44:04 +02008811static int
8812unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8813 Py_UCS1 *translate)
8814{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008815 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 int ret = 0;
8817
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 if (charmaptranslate_lookup(ch, mapping, &item)) {
8819 return -1;
8820 }
8821
8822 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008823 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008824 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008826 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 /* not found => default to 1:1 mapping */
8828 translate[ch] = ch;
8829 return 1;
8830 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008831 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008832 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008833 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8834 used it */
8835 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008836 /* invalid character or character outside ASCII:
8837 skip the fast translate */
8838 goto exit;
8839 }
8840 translate[ch] = (Py_UCS1)replace;
8841 }
8842 else if (PyUnicode_Check(item)) {
8843 Py_UCS4 replace;
8844
8845 if (PyUnicode_READY(item) == -1) {
8846 Py_DECREF(item);
8847 return -1;
8848 }
8849 if (PyUnicode_GET_LENGTH(item) != 1)
8850 goto exit;
8851
8852 replace = PyUnicode_READ_CHAR(item, 0);
8853 if (replace > 127)
8854 goto exit;
8855 translate[ch] = (Py_UCS1)replace;
8856 }
8857 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008858 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 goto exit;
8860 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 ret = 1;
8862
Benjamin Peterson1365de72014-04-07 20:15:41 -04008863 exit:
8864 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 return ret;
8866}
8867
8868/* Fast path for ascii => ascii translation. Return 1 if the whole string
8869 was translated into writer, return 0 if the input string was partially
8870 translated into writer, raise an exception and return -1 on error. */
8871static int
8872unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008873 _PyUnicodeWriter *writer, int ignore,
8874 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875{
Victor Stinner872b2912014-04-05 14:27:07 +02008876 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 Py_ssize_t len;
8878 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008879 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881 len = PyUnicode_GET_LENGTH(input);
8882
Victor Stinner872b2912014-04-05 14:27:07 +02008883 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884
8885 in = PyUnicode_1BYTE_DATA(input);
8886 end = in + len;
8887
8888 assert(PyUnicode_IS_ASCII(writer->buffer));
8889 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8890 out = PyUnicode_1BYTE_DATA(writer->buffer);
8891
Victor Stinner872b2912014-04-05 14:27:07 +02008892 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008894 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008896 int translate = unicode_fast_translate_lookup(mapping, ch,
8897 ascii_table);
8898 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008900 if (translate == 0)
8901 goto exit;
8902 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903 }
Victor Stinner872b2912014-04-05 14:27:07 +02008904 if (ch2 == 0xfe) {
8905 if (ignore)
8906 continue;
8907 goto exit;
8908 }
8909 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008911 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 }
Victor Stinner872b2912014-04-05 14:27:07 +02008913 res = 1;
8914
8915exit:
8916 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008917 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008918 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919}
8920
Victor Stinner3222da22015-10-01 22:07:32 +02008921static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922_PyUnicode_TranslateCharmap(PyObject *input,
8923 PyObject *mapping,
8924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 Py_ssize_t size, i;
8929 int kind;
8930 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 _PyUnicodeWriter writer;
8932 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008933 char *reason = "character maps to <undefined>";
8934 PyObject *errorHandler = NULL;
8935 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008936 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 PyErr_BadArgument();
8941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 if (PyUnicode_READY(input) == -1)
8945 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008946 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 kind = PyUnicode_KIND(input);
8948 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008950 if (size == 0)
8951 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008953 /* allocate enough for a simple 1:1 translation without
8954 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 _PyUnicodeWriter_Init(&writer);
8956 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958
Victor Stinner872b2912014-04-05 14:27:07 +02008959 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8960
Victor Stinner33798672016-03-01 21:59:58 +01008961 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008962 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008963 if (PyUnicode_IS_ASCII(input)) {
8964 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8965 if (res < 0) {
8966 _PyUnicodeWriter_Dealloc(&writer);
8967 return NULL;
8968 }
8969 if (res == 1)
8970 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008971 }
Victor Stinner33798672016-03-01 21:59:58 +01008972 else {
8973 i = 0;
8974 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 int translate;
8979 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8980 Py_ssize_t newpos;
8981 /* startpos for collecting untranslatable chars */
8982 Py_ssize_t collstart;
8983 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 ch = PyUnicode_READ(kind, data, i);
8987 translate = charmaptranslate_output(ch, mapping, &writer);
8988 if (translate < 0)
8989 goto onError;
8990
8991 if (translate != 0) {
8992 /* it worked => adjust input pointer */
8993 ++i;
8994 continue;
8995 }
8996
8997 /* untranslatable character */
8998 collstart = i;
8999 collend = i+1;
9000
9001 /* find all untranslatable characters */
9002 while (collend < size) {
9003 PyObject *x;
9004 ch = PyUnicode_READ(kind, data, collend);
9005 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 Py_XDECREF(x);
9008 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009010 ++collend;
9011 }
9012
9013 if (ignore) {
9014 i = collend;
9015 }
9016 else {
9017 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9018 reason, input, &exc,
9019 collstart, collend, &newpos);
9020 if (repunicode == NULL)
9021 goto onError;
9022 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009025 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 Py_DECREF(repunicode);
9027 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009028 }
9029 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030 Py_XDECREF(exc);
9031 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009032 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009036 Py_XDECREF(exc);
9037 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 return NULL;
9039}
9040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041/* Deprecated. Use PyUnicode_Translate instead. */
9042PyObject *
9043PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9044 Py_ssize_t size,
9045 PyObject *mapping,
9046 const char *errors)
9047{
Christian Heimes5f520f42012-09-11 14:03:25 +02009048 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9050 if (!unicode)
9051 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009052 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9053 Py_DECREF(unicode);
9054 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055}
9056
Alexander Belopolsky40018472011-02-26 01:02:56 +00009057PyObject *
9058PyUnicode_Translate(PyObject *str,
9059 PyObject *mapping,
9060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009062 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009063 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009064 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065}
Tim Petersced69f82003-09-16 20:30:58 +00009066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009068fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069{
9070 /* No need to call PyUnicode_READY(self) because this function is only
9071 called as a callback from fixup() which does it already. */
9072 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9073 const int kind = PyUnicode_KIND(self);
9074 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009075 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009076 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 Py_ssize_t i;
9078
9079 for (i = 0; i < len; ++i) {
9080 ch = PyUnicode_READ(kind, data, i);
9081 fixed = 0;
9082 if (ch > 127) {
9083 if (Py_UNICODE_ISSPACE(ch))
9084 fixed = ' ';
9085 else {
9086 const int decimal = Py_UNICODE_TODECIMAL(ch);
9087 if (decimal >= 0)
9088 fixed = '0' + decimal;
9089 }
9090 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009091 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009092 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 PyUnicode_WRITE(kind, data, i, fixed);
9094 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009095 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009096 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 }
9099
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009100 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101}
9102
9103PyObject *
9104_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9105{
9106 if (!PyUnicode_Check(unicode)) {
9107 PyErr_BadInternalCall();
9108 return NULL;
9109 }
9110 if (PyUnicode_READY(unicode) == -1)
9111 return NULL;
9112 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9113 /* If the string is already ASCII, just return the same string */
9114 Py_INCREF(unicode);
9115 return unicode;
9116 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009117 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118}
9119
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009120PyObject *
9121PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9122 Py_ssize_t length)
9123{
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009125 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009126 Py_UCS4 maxchar;
9127 enum PyUnicode_Kind kind;
9128 void *data;
9129
Victor Stinner99d7ad02012-02-22 13:37:39 +01009130 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009132 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009133 if (ch > 127) {
9134 int decimal = Py_UNICODE_TODECIMAL(ch);
9135 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009136 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009137 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009138 }
9139 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009140
9141 /* Copy to a new string */
9142 decimal = PyUnicode_New(length, maxchar);
9143 if (decimal == NULL)
9144 return decimal;
9145 kind = PyUnicode_KIND(decimal);
9146 data = PyUnicode_DATA(decimal);
9147 /* Iterate over code points */
9148 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009149 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009150 if (ch > 127) {
9151 int decimal = Py_UNICODE_TODECIMAL(ch);
9152 if (decimal >= 0)
9153 ch = '0' + decimal;
9154 }
9155 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009157 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009158}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009159/* --- Decimal Encoder ---------------------------------------------------- */
9160
Alexander Belopolsky40018472011-02-26 01:02:56 +00009161int
9162PyUnicode_EncodeDecimal(Py_UNICODE *s,
9163 Py_ssize_t length,
9164 char *output,
9165 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009166{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009167 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009168 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009169 enum PyUnicode_Kind kind;
9170 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009171
9172 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 PyErr_BadArgument();
9174 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009175 }
9176
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 unicode = PyUnicode_FromUnicode(s, length);
9178 if (unicode == NULL)
9179 return -1;
9180
Benjamin Petersonbac79492012-01-14 13:34:47 -05009181 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009182 Py_DECREF(unicode);
9183 return -1;
9184 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009185 kind = PyUnicode_KIND(unicode);
9186 data = PyUnicode_DATA(unicode);
9187
Victor Stinnerb84d7232011-11-22 01:50:07 +01009188 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009189 PyObject *exc;
9190 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009192 Py_ssize_t startpos;
9193
9194 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009195
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009197 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009200 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 decimal = Py_UNICODE_TODECIMAL(ch);
9202 if (decimal >= 0) {
9203 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009204 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 continue;
9206 }
9207 if (0 < ch && ch < 256) {
9208 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009209 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 continue;
9211 }
Victor Stinner6345be92011-11-25 20:09:01 +01009212
Victor Stinner42bf7752011-11-21 22:52:58 +01009213 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009214 exc = NULL;
9215 raise_encode_exception(&exc, "decimal", unicode,
9216 startpos, startpos+1,
9217 "invalid decimal Unicode string");
9218 Py_XDECREF(exc);
9219 Py_DECREF(unicode);
9220 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009221 }
9222 /* 0-terminate the output string */
9223 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009224 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009225 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009226}
9227
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228/* --- Helpers ------------------------------------------------------------ */
9229
/* Helper macro to fix up start/end slice values.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  `start` is not clamped from above.

   `start` and `end` must be modifiable lvalues of a signed integer type.
   Every argument may be evaluated more than once, so none of them should
   have side effects.

   The expansion is a single statement (do { ... } while (0)) and must be
   followed by a semicolon; this keeps the macro safe when it is used as
   the body of an unbraced if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009246any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009248 Py_ssize_t end,
9249 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 void *buf1, *buf2;
9253 Py_ssize_t len1, len2, result;
9254
9255 kind1 = PyUnicode_KIND(s1);
9256 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009257 if (kind1 < kind2)
9258 return -1;
9259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 len1 = PyUnicode_GET_LENGTH(s1);
9261 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009262 ADJUST_INDICES(start, end, len1);
9263 if (end - start < len2)
9264 return -1;
9265
9266 buf1 = PyUnicode_DATA(s1);
9267 buf2 = PyUnicode_DATA(s2);
9268 if (len2 == 1) {
9269 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9270 result = findchar((const char *)buf1 + kind1*start,
9271 kind1, end - start, ch, direction);
9272 if (result == -1)
9273 return -1;
9274 else
9275 return start + result;
9276 }
9277
9278 if (kind2 != kind1) {
9279 buf2 = _PyUnicode_AsKind(s2, kind1);
9280 if (!buf2)
9281 return -2;
9282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283
Victor Stinner794d5672011-10-10 03:21:36 +02009284 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009285 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009286 case PyUnicode_1BYTE_KIND:
9287 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9288 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9289 else
9290 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 case PyUnicode_2BYTE_KIND:
9293 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9294 break;
9295 case PyUnicode_4BYTE_KIND:
9296 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9297 break;
9298 default:
9299 assert(0); result = -2;
9300 }
9301 }
9302 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009303 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009304 case PyUnicode_1BYTE_KIND:
9305 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9306 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 else
9308 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 case PyUnicode_2BYTE_KIND:
9311 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9312 break;
9313 case PyUnicode_4BYTE_KIND:
9314 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9315 break;
9316 default:
9317 assert(0); result = -2;
9318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 }
9320
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009321 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 PyMem_Free(buf2);
9323
9324 return result;
9325}
9326
9327Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009328_PyUnicode_InsertThousandsGrouping(
9329 PyObject *unicode, Py_ssize_t index,
9330 Py_ssize_t n_buffer,
9331 void *digits, Py_ssize_t n_digits,
9332 Py_ssize_t min_width,
9333 const char *grouping, PyObject *thousands_sep,
9334 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335{
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 Py_ssize_t thousands_sep_len;
9339 Py_ssize_t len;
9340
9341 if (unicode != NULL) {
9342 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009343 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009344 }
9345 else {
9346 kind = PyUnicode_1BYTE_KIND;
9347 data = NULL;
9348 }
9349 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9350 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9351 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9352 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009353 if (thousands_sep_kind < kind) {
9354 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9355 if (!thousands_sep_data)
9356 return -1;
9357 }
9358 else {
9359 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9360 if (!data)
9361 return -1;
9362 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 }
9364
Benjamin Petersonead6b532011-12-20 17:23:42 -06009365 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009367 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009369 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009371 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009372 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009374 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009385 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009386 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009387 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009388 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009389 break;
9390 default:
9391 assert(0);
9392 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009394 if (unicode != NULL && thousands_sep_kind != kind) {
9395 if (thousands_sep_kind < kind)
9396 PyMem_Free(thousands_sep_data);
9397 else
9398 PyMem_Free(data);
9399 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009400 if (unicode == NULL) {
9401 *maxchar = 127;
9402 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009403 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009404 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009405 }
9406 }
9407 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408}
9409
9410
Alexander Belopolsky40018472011-02-26 01:02:56 +00009411Py_ssize_t
9412PyUnicode_Count(PyObject *str,
9413 PyObject *substr,
9414 Py_ssize_t start,
9415 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009417 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009418 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 void *buf1 = NULL, *buf2 = NULL;
9420 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009421
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009422 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009425 kind1 = PyUnicode_KIND(str);
9426 kind2 = PyUnicode_KIND(substr);
9427 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 len1 = PyUnicode_GET_LENGTH(str);
9431 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009436 buf1 = PyUnicode_DATA(str);
9437 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009438 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009440 if (!buf2)
9441 goto onError;
9442 }
9443
9444 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009446 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009447 result = asciilib_count(
9448 ((Py_UCS1*)buf1) + start, end - start,
9449 buf2, len2, PY_SSIZE_T_MAX
9450 );
9451 else
9452 result = ucs1lib_count(
9453 ((Py_UCS1*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 break;
9457 case PyUnicode_2BYTE_KIND:
9458 result = ucs2lib_count(
9459 ((Py_UCS2*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 case PyUnicode_4BYTE_KIND:
9464 result = ucs4lib_count(
9465 ((Py_UCS4*)buf1) + start, end - start,
9466 buf2, len2, PY_SSIZE_T_MAX
9467 );
9468 break;
9469 default:
9470 assert(0); result = 0;
9471 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009472
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009473 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 PyMem_Free(buf2);
9475
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009478 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 PyMem_Free(buf2);
9480 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481}
9482
Alexander Belopolsky40018472011-02-26 01:02:56 +00009483Py_ssize_t
9484PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009485 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009486 Py_ssize_t start,
9487 Py_ssize_t end,
9488 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009490 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009492
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009493 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494}
9495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496Py_ssize_t
9497PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9498 Py_ssize_t start, Py_ssize_t end,
9499 int direction)
9500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009502 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 if (PyUnicode_READY(str) == -1)
9504 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009505 if (start < 0 || end < 0) {
9506 PyErr_SetString(PyExc_IndexError, "string index out of range");
9507 return -2;
9508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 if (end > PyUnicode_GET_LENGTH(str))
9510 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009511 if (start >= end)
9512 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009514 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9515 kind, end-start, ch, direction);
9516 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009518 else
9519 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520}
9521
Alexander Belopolsky40018472011-02-26 01:02:56 +00009522static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009523tailmatch(PyObject *self,
9524 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009525 Py_ssize_t start,
9526 Py_ssize_t end,
9527 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 int kind_self;
9530 int kind_sub;
9531 void *data_self;
9532 void *data_sub;
9533 Py_ssize_t offset;
9534 Py_ssize_t i;
9535 Py_ssize_t end_sub;
9536
9537 if (PyUnicode_READY(self) == -1 ||
9538 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009539 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9542 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009546 if (PyUnicode_GET_LENGTH(substring) == 0)
9547 return 1;
9548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 kind_self = PyUnicode_KIND(self);
9550 data_self = PyUnicode_DATA(self);
9551 kind_sub = PyUnicode_KIND(substring);
9552 data_sub = PyUnicode_DATA(substring);
9553 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9554
9555 if (direction > 0)
9556 offset = end;
9557 else
9558 offset = start;
9559
9560 if (PyUnicode_READ(kind_self, data_self, offset) ==
9561 PyUnicode_READ(kind_sub, data_sub, 0) &&
9562 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9563 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9564 /* If both are of the same kind, memcmp is sufficient */
9565 if (kind_self == kind_sub) {
9566 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009567 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 data_sub,
9569 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009570 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009572 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 else {
9574 /* We do not need to compare 0 and len(substring)-1 because
9575 the if statement above ensured already that they are equal
9576 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 for (i = 1; i < end_sub; ++i) {
9578 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9579 PyUnicode_READ(kind_sub, data_sub, i))
9580 return 0;
9581 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584 }
9585
9586 return 0;
9587}
9588
Alexander Belopolsky40018472011-02-26 01:02:56 +00009589Py_ssize_t
9590PyUnicode_Tailmatch(PyObject *str,
9591 PyObject *substr,
9592 Py_ssize_t start,
9593 Py_ssize_t end,
9594 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009596 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009598
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009599 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600}
9601
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602/* Apply fixfct filter to the Unicode object self and return a
9603 reference to the modified object */
9604
Alexander Belopolsky40018472011-02-26 01:02:56 +00009605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009606fixup(PyObject *self,
9607 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 PyObject *u;
9610 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009611 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009613 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009616 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 /* fix functions return the new maximum character in a string,
9619 if the kind of the resulting unicode object does not change,
9620 everything is fine. Otherwise we need to change the string kind
9621 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009622 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009623
9624 if (maxchar_new == 0) {
9625 /* no changes */;
9626 if (PyUnicode_CheckExact(self)) {
9627 Py_DECREF(u);
9628 Py_INCREF(self);
9629 return self;
9630 }
9631 else
9632 return u;
9633 }
9634
Victor Stinnere6abb482012-05-02 01:15:40 +02009635 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636
Victor Stinnereaab6042011-12-11 22:22:39 +01009637 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009639
9640 /* In case the maximum character changed, we need to
9641 convert the string to the new category. */
9642 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9643 if (v == NULL) {
9644 Py_DECREF(u);
9645 return NULL;
9646 }
9647 if (maxchar_new > maxchar_old) {
9648 /* If the maxchar increased so that the kind changed, not all
9649 characters are representable anymore and we need to fix the
9650 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009651 _PyUnicode_FastCopyCharacters(v, 0,
9652 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009653 maxchar_old = fixfct(v);
9654 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 }
9656 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009657 _PyUnicode_FastCopyCharacters(v, 0,
9658 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009660 Py_DECREF(u);
9661 assert(_PyUnicode_CheckConsistency(v, 1));
9662 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663}
9664
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665static PyObject *
9666ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9669 char *resdata, *data = PyUnicode_DATA(self);
9670 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009671
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 res = PyUnicode_New(len, 127);
9673 if (res == NULL)
9674 return NULL;
9675 resdata = PyUnicode_DATA(res);
9676 if (lower)
9677 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 _Py_bytes_upper(resdata, data, len);
9680 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681}
9682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 Py_ssize_t j;
9687 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009688 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009690
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9692
9693 where ! is a negation and \p{xxx} is a character with property xxx.
9694 */
9695 for (j = i - 1; j >= 0; j--) {
9696 c = PyUnicode_READ(kind, data, j);
9697 if (!_PyUnicode_IsCaseIgnorable(c))
9698 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9701 if (final_sigma) {
9702 for (j = i + 1; j < length; j++) {
9703 c = PyUnicode_READ(kind, data, j);
9704 if (!_PyUnicode_IsCaseIgnorable(c))
9705 break;
9706 }
9707 final_sigma = j == length || !_PyUnicode_IsCased(c);
9708 }
9709 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710}
9711
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712static int
9713lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9714 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 /* Obscure special case. */
9717 if (c == 0x3A3) {
9718 mapped[0] = handle_capital_sigma(kind, data, length, i);
9719 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722}
9723
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724static Py_ssize_t
9725do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 Py_ssize_t i, k = 0;
9728 int n_res, j;
9729 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009730
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 c = PyUnicode_READ(kind, data, 0);
9732 n_res = _PyUnicode_ToUpperFull(c, mapped);
9733 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009734 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009735 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 for (i = 1; i < length; i++) {
9738 c = PyUnicode_READ(kind, data, i);
9739 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9740 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009741 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009743 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009744 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009745 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746}
9747
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009748static Py_ssize_t
9749do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9750 Py_ssize_t i, k = 0;
9751
9752 for (i = 0; i < length; i++) {
9753 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9754 int n_res, j;
9755 if (Py_UNICODE_ISUPPER(c)) {
9756 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9757 }
9758 else if (Py_UNICODE_ISLOWER(c)) {
9759 n_res = _PyUnicode_ToUpperFull(c, mapped);
9760 }
9761 else {
9762 n_res = 1;
9763 mapped[0] = c;
9764 }
9765 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009766 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 res[k++] = mapped[j];
9768 }
9769 }
9770 return k;
9771}
9772
9773static Py_ssize_t
9774do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9775 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 Py_ssize_t i, k = 0;
9778
9779 for (i = 0; i < length; i++) {
9780 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9781 int n_res, j;
9782 if (lower)
9783 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9784 else
9785 n_res = _PyUnicode_ToUpperFull(c, mapped);
9786 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009787 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788 res[k++] = mapped[j];
9789 }
9790 }
9791 return k;
9792}
9793
9794static Py_ssize_t
9795do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9796{
9797 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9798}
9799
9800static Py_ssize_t
9801do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9802{
9803 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9804}
9805
Benjamin Petersone51757f2012-01-12 21:10:29 -05009806static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009807do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9808{
9809 Py_ssize_t i, k = 0;
9810
9811 for (i = 0; i < length; i++) {
9812 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9813 Py_UCS4 mapped[3];
9814 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9815 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009816 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009817 res[k++] = mapped[j];
9818 }
9819 }
9820 return k;
9821}
9822
9823static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009824do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9825{
9826 Py_ssize_t i, k = 0;
9827 int previous_is_cased;
9828
9829 previous_is_cased = 0;
9830 for (i = 0; i < length; i++) {
9831 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9832 Py_UCS4 mapped[3];
9833 int n_res, j;
9834
9835 if (previous_is_cased)
9836 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9837 else
9838 n_res = _PyUnicode_ToTitleFull(c, mapped);
9839
9840 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009841 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009842 res[k++] = mapped[j];
9843 }
9844
9845 previous_is_cased = _PyUnicode_IsCased(c);
9846 }
9847 return k;
9848}
9849
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009850static PyObject *
9851case_operation(PyObject *self,
9852 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9853{
9854 PyObject *res = NULL;
9855 Py_ssize_t length, newlength = 0;
9856 int kind, outkind;
9857 void *data, *outdata;
9858 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9859
Benjamin Petersoneea48462012-01-16 14:28:50 -05009860 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861
9862 kind = PyUnicode_KIND(self);
9863 data = PyUnicode_DATA(self);
9864 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009865 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009866 PyErr_SetString(PyExc_OverflowError, "string is too long");
9867 return NULL;
9868 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009869 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870 if (tmp == NULL)
9871 return PyErr_NoMemory();
9872 newlength = perform(kind, data, length, tmp, &maxchar);
9873 res = PyUnicode_New(newlength, maxchar);
9874 if (res == NULL)
9875 goto leave;
9876 tmpend = tmp + newlength;
9877 outdata = PyUnicode_DATA(res);
9878 outkind = PyUnicode_KIND(res);
9879 switch (outkind) {
9880 case PyUnicode_1BYTE_KIND:
9881 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9882 break;
9883 case PyUnicode_2BYTE_KIND:
9884 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9885 break;
9886 case PyUnicode_4BYTE_KIND:
9887 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9888 break;
9889 default:
9890 assert(0);
9891 break;
9892 }
9893 leave:
9894 PyMem_FREE(tmp);
9895 return res;
9896}
9897
Tim Peters8ce9f162004-08-27 01:49:32 +00009898PyObject *
9899PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009902 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009905 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9906 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009907 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009909 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009911 int use_memcpy;
9912 unsigned char *res_data = NULL, *sep_data = NULL;
9913 PyObject *last_obj;
9914 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009916 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009917 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009919 }
9920
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009921 /* NOTE: the following code can't call back into Python code,
9922 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009923 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 seqlen = PySequence_Fast_GET_SIZE(fseq);
9926 /* If empty sequence, return u"". */
9927 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009928 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009929 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009930 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009931
Tim Peters05eba1f2004-08-27 21:32:02 +00009932 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009933 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009934 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 if (seqlen == 1) {
9936 if (PyUnicode_CheckExact(items[0])) {
9937 res = items[0];
9938 Py_INCREF(res);
9939 Py_DECREF(fseq);
9940 return res;
9941 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009942 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009943 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009944 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009945 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009946 /* Set up sep and seplen */
9947 if (separator == NULL) {
9948 /* fall back to a blank space separator */
9949 sep = PyUnicode_FromOrdinal(' ');
9950 if (!sep)
9951 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009952 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009953 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009954 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009955 else {
9956 if (!PyUnicode_Check(separator)) {
9957 PyErr_Format(PyExc_TypeError,
9958 "separator: expected str instance,"
9959 " %.80s found",
9960 Py_TYPE(separator)->tp_name);
9961 goto onError;
9962 }
9963 if (PyUnicode_READY(separator))
9964 goto onError;
9965 sep = separator;
9966 seplen = PyUnicode_GET_LENGTH(separator);
9967 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9968 /* inc refcount to keep this code path symmetric with the
9969 above case of a blank separator */
9970 Py_INCREF(sep);
9971 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009972 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009973 }
9974
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009975 /* There are at least two things to join, or else we have a subclass
9976 * of str in the sequence.
9977 * Do a pre-pass to figure out the total amount of space we'll
9978 * need (sz), and see whether all argument are strings.
9979 */
9980 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009981#ifdef Py_DEBUG
9982 use_memcpy = 0;
9983#else
9984 use_memcpy = 1;
9985#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 for (i = 0; i < seqlen; i++) {
9987 const Py_ssize_t old_sz = sz;
9988 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 if (!PyUnicode_Check(item)) {
9990 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009991 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 " %.80s found",
9993 i, Py_TYPE(item)->tp_name);
9994 goto onError;
9995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 if (PyUnicode_READY(item) == -1)
9997 goto onError;
9998 sz += PyUnicode_GET_LENGTH(item);
9999 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010000 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 if (i != 0)
10002 sz += seplen;
10003 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
10004 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 goto onError;
10007 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 if (use_memcpy && last_obj != NULL) {
10009 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10010 use_memcpy = 0;
10011 }
10012 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 }
Tim Petersced69f82003-09-16 20:30:58 +000010014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010016 if (res == NULL)
10017 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010018
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010019 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010020#ifdef Py_DEBUG
10021 use_memcpy = 0;
10022#else
10023 if (use_memcpy) {
10024 res_data = PyUnicode_1BYTE_DATA(res);
10025 kind = PyUnicode_KIND(res);
10026 if (seplen != 0)
10027 sep_data = PyUnicode_1BYTE_DATA(sep);
10028 }
10029#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010030 if (use_memcpy) {
10031 for (i = 0; i < seqlen; ++i) {
10032 Py_ssize_t itemlen;
10033 item = items[i];
10034
10035 /* Copy item, and maybe the separator. */
10036 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010037 Py_MEMCPY(res_data,
10038 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010039 kind * seplen);
10040 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010041 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010042
10043 itemlen = PyUnicode_GET_LENGTH(item);
10044 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 Py_MEMCPY(res_data,
10046 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010047 kind * itemlen);
10048 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010049 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010050 }
10051 assert(res_data == PyUnicode_1BYTE_DATA(res)
10052 + kind * PyUnicode_GET_LENGTH(res));
10053 }
10054 else {
10055 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10056 Py_ssize_t itemlen;
10057 item = items[i];
10058
10059 /* Copy item, and maybe the separator. */
10060 if (i && seplen != 0) {
10061 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10062 res_offset += seplen;
10063 }
10064
10065 itemlen = PyUnicode_GET_LENGTH(item);
10066 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010067 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010068 res_offset += itemlen;
10069 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010070 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010071 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010072 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010073
Tim Peters05eba1f2004-08-27 21:32:02 +000010074 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010076 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010080 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010082 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083 return NULL;
10084}
10085
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive elements of the buffer `data`,
   beginning at element index `start`.  `kind` selects the element width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND is rejected by an assertion.

   Every macro argument is parenthesized where it is used so that
   arbitrary expressions can be passed safely.  Note that `value` and
   `length` are evaluated more than once in the 2- and 4-byte branches. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10109
Victor Stinnerd3f08822012-05-29 12:57:52 +020010110void
10111_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10112 Py_UCS4 fill_char)
10113{
10114 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10115 const void *data = PyUnicode_DATA(unicode);
10116 assert(PyUnicode_IS_READY(unicode));
10117 assert(unicode_modifiable(unicode));
10118 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10119 assert(start >= 0);
10120 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10121 FILL(kind, data, fill_char, start, length);
10122}
10123
Victor Stinner3fe55312012-01-04 00:33:50 +010010124Py_ssize_t
10125PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10126 Py_UCS4 fill_char)
10127{
10128 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010129
10130 if (!PyUnicode_Check(unicode)) {
10131 PyErr_BadInternalCall();
10132 return -1;
10133 }
10134 if (PyUnicode_READY(unicode) == -1)
10135 return -1;
10136 if (unicode_check_modifiable(unicode))
10137 return -1;
10138
Victor Stinnerd3f08822012-05-29 12:57:52 +020010139 if (start < 0) {
10140 PyErr_SetString(PyExc_IndexError, "string index out of range");
10141 return -1;
10142 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010143 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10144 PyErr_SetString(PyExc_ValueError,
10145 "fill character is bigger than "
10146 "the string maximum character");
10147 return -1;
10148 }
10149
10150 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10151 length = Py_MIN(maxlen, length);
10152 if (length <= 0)
10153 return 0;
10154
Victor Stinnerd3f08822012-05-29 12:57:52 +020010155 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010156 return length;
10157}
10158
Victor Stinner9310abb2011-10-05 00:59:23 +020010159static PyObject *
10160pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010161 Py_ssize_t left,
10162 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 PyObject *u;
10166 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010167 int kind;
10168 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169
10170 if (left < 0)
10171 left = 0;
10172 if (right < 0)
10173 right = 0;
10174
Victor Stinnerc4b49542011-12-11 22:44:26 +010010175 if (left == 0 && right == 0)
10176 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10179 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010180 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10181 return NULL;
10182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010184 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010186 if (!u)
10187 return NULL;
10188
10189 kind = PyUnicode_KIND(u);
10190 data = PyUnicode_DATA(u);
10191 if (left)
10192 FILL(kind, data, fill, 0, left);
10193 if (right)
10194 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010195 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010196 assert(_PyUnicode_CheckConsistency(u, 1));
10197 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198}
10199
Alexander Belopolsky40018472011-02-26 01:02:56 +000010200PyObject *
10201PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010205 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(string))
10211 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 PyUnicode_GET_LENGTH(string), keepends);
10214 else
10215 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010216 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010217 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 break;
10219 case PyUnicode_2BYTE_KIND:
10220 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010221 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 PyUnicode_GET_LENGTH(string), keepends);
10223 break;
10224 case PyUnicode_4BYTE_KIND:
10225 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyUnicode_GET_LENGTH(string), keepends);
10228 break;
10229 default:
10230 assert(0);
10231 list = 0;
10232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Alexander Belopolsky40018472011-02-26 01:02:56 +000010236static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010237split(PyObject *self,
10238 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010239 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010241 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 void *buf1, *buf2;
10243 Py_ssize_t len1, len2;
10244 PyObject* out;
10245
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010247 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (PyUnicode_READY(self) == -1)
10250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010253 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010255 if (PyUnicode_IS_ASCII(self))
10256 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 PyUnicode_GET_LENGTH(self), maxcount
10259 );
10260 else
10261 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 case PyUnicode_2BYTE_KIND:
10266 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 case PyUnicode_4BYTE_KIND:
10271 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 default:
10276 assert(0);
10277 return NULL;
10278 }
10279
10280 if (PyUnicode_READY(substring) == -1)
10281 return NULL;
10282
10283 kind1 = PyUnicode_KIND(self);
10284 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 len1 = PyUnicode_GET_LENGTH(self);
10286 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010287 if (kind1 < kind2 || len1 < len2) {
10288 out = PyList_New(1);
10289 if (out == NULL)
10290 return NULL;
10291 Py_INCREF(self);
10292 PyList_SET_ITEM(out, 0, self);
10293 return out;
10294 }
10295 buf1 = PyUnicode_DATA(self);
10296 buf2 = PyUnicode_DATA(substring);
10297 if (kind2 != kind1) {
10298 buf2 = _PyUnicode_AsKind(substring, kind1);
10299 if (!buf2)
10300 return NULL;
10301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010303 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010305 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10306 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010307 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010308 else
10309 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010310 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 break;
10312 case PyUnicode_2BYTE_KIND:
10313 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010314 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 break;
10316 case PyUnicode_4BYTE_KIND:
10317 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010318 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 break;
10320 default:
10321 out = NULL;
10322 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010323 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 PyMem_Free(buf2);
10325 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326}
10327
Alexander Belopolsky40018472011-02-26 01:02:56 +000010328static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010329rsplit(PyObject *self,
10330 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010331 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010332{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010333 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 void *buf1, *buf2;
10335 Py_ssize_t len1, len2;
10336 PyObject* out;
10337
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010339 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (PyUnicode_READY(self) == -1)
10342 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010345 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 if (PyUnicode_IS_ASCII(self))
10348 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 PyUnicode_GET_LENGTH(self), maxcount
10351 );
10352 else
10353 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010355 PyUnicode_GET_LENGTH(self), maxcount
10356 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 case PyUnicode_2BYTE_KIND:
10358 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 PyUnicode_GET_LENGTH(self), maxcount
10361 );
10362 case PyUnicode_4BYTE_KIND:
10363 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010364 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 PyUnicode_GET_LENGTH(self), maxcount
10366 );
10367 default:
10368 assert(0);
10369 return NULL;
10370 }
10371
10372 if (PyUnicode_READY(substring) == -1)
10373 return NULL;
10374
10375 kind1 = PyUnicode_KIND(self);
10376 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 len1 = PyUnicode_GET_LENGTH(self);
10378 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010379 if (kind1 < kind2 || len1 < len2) {
10380 out = PyList_New(1);
10381 if (out == NULL)
10382 return NULL;
10383 Py_INCREF(self);
10384 PyList_SET_ITEM(out, 0, self);
10385 return out;
10386 }
10387 buf1 = PyUnicode_DATA(self);
10388 buf2 = PyUnicode_DATA(substring);
10389 if (kind2 != kind1) {
10390 buf2 = _PyUnicode_AsKind(substring, kind1);
10391 if (!buf2)
10392 return NULL;
10393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10398 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 else
10401 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 break;
10404 case PyUnicode_2BYTE_KIND:
10405 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010406 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 break;
10408 case PyUnicode_4BYTE_KIND:
10409 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010410 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 break;
10412 default:
10413 out = NULL;
10414 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010415 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyMem_Free(buf2);
10417 return out;
10418}
10419
10420static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10422 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010424 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10427 return asciilib_find(buf1, len1, buf2, len2, offset);
10428 else
10429 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_2BYTE_KIND:
10431 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10432 case PyUnicode_4BYTE_KIND:
10433 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10434 }
10435 assert(0);
10436 return -1;
10437}
10438
10439static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010440anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10441 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010443 switch (kind) {
10444 case PyUnicode_1BYTE_KIND:
10445 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10446 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10447 else
10448 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10449 case PyUnicode_2BYTE_KIND:
10450 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10451 case PyUnicode_4BYTE_KIND:
10452 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10453 }
10454 assert(0);
10455 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010456}
10457
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458static void
10459replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10460 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10461{
10462 int kind = PyUnicode_KIND(u);
10463 void *data = PyUnicode_DATA(u);
10464 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10465 if (kind == PyUnicode_1BYTE_KIND) {
10466 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10467 (Py_UCS1 *)data + len,
10468 u1, u2, maxcount);
10469 }
10470 else if (kind == PyUnicode_2BYTE_KIND) {
10471 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10472 (Py_UCS2 *)data + len,
10473 u1, u2, maxcount);
10474 }
10475 else {
10476 assert(kind == PyUnicode_4BYTE_KIND);
10477 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10478 (Py_UCS4 *)data + len,
10479 u1, u2, maxcount);
10480 }
10481}
10482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484replace(PyObject *self, PyObject *str1,
10485 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 PyObject *u;
10488 char *sbuf = PyUnicode_DATA(self);
10489 char *buf1 = PyUnicode_DATA(str1);
10490 char *buf2 = PyUnicode_DATA(str2);
10491 int srelease = 0, release1 = 0, release2 = 0;
10492 int skind = PyUnicode_KIND(self);
10493 int kind1 = PyUnicode_KIND(str1);
10494 int kind2 = PyUnicode_KIND(str2);
10495 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10496 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10497 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010498 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010499 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
10501 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010504 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
Victor Stinner59de0ee2011-10-07 10:01:28 +020010506 if (str1 == str2)
10507 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508
Victor Stinner49a0a212011-10-12 23:46:10 +020010509 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010510 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10511 if (maxchar < maxchar_str1)
10512 /* substring too wide to be present */
10513 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010514 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10515 /* Replacing str1 with str2 may cause a maxchar reduction in the
10516 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010517 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010518 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010523 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010525 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Victor Stinner69ed0f42013-04-09 21:48:24 +020010529 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010531 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010533 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010537
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010538 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10539 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 }
10541 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 int rkind = skind;
10543 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010544 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (kind1 < rkind) {
10547 /* widen substring */
10548 buf1 = _PyUnicode_AsKind(str1, rkind);
10549 if (!buf1) goto error;
10550 release1 = 1;
10551 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 if (i < 0)
10554 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (rkind > kind2) {
10556 /* widen replacement */
10557 buf2 = _PyUnicode_AsKind(str2, rkind);
10558 if (!buf2) goto error;
10559 release2 = 1;
10560 }
10561 else if (rkind < kind2) {
10562 /* widen self and buf1 */
10563 rkind = kind2;
10564 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010565 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 sbuf = _PyUnicode_AsKind(self, rkind);
10567 if (!sbuf) goto error;
10568 srelease = 1;
10569 buf1 = _PyUnicode_AsKind(str1, rkind);
10570 if (!buf1) goto error;
10571 release1 = 1;
10572 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 u = PyUnicode_New(slen, maxchar);
10574 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 assert(PyUnicode_KIND(u) == rkind);
10577 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010578
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010580 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010585
10586 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590 if (i == -1)
10591 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010592 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 }
10599 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010601 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 int rkind = skind;
10603 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf1 = _PyUnicode_AsKind(str1, rkind);
10608 if (!buf1) goto error;
10609 release1 = 1;
10610 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010611 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 if (n == 0)
10613 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 buf2 = _PyUnicode_AsKind(str2, rkind);
10617 if (!buf2) goto error;
10618 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 rkind = kind2;
10623 sbuf = _PyUnicode_AsKind(self, rkind);
10624 if (!sbuf) goto error;
10625 srelease = 1;
10626 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010627 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 buf1 = _PyUnicode_AsKind(str1, rkind);
10629 if (!buf1) goto error;
10630 release1 = 1;
10631 }
10632 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10633 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010634 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 PyErr_SetString(PyExc_OverflowError,
10636 "replace string is too long");
10637 goto error;
10638 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010639 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010641 _Py_INCREF_UNICODE_EMPTY();
10642 if (!unicode_empty)
10643 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010644 u = unicode_empty;
10645 goto done;
10646 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010647 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 PyErr_SetString(PyExc_OverflowError,
10649 "replace string is too long");
10650 goto error;
10651 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010652 u = PyUnicode_New(new_size, maxchar);
10653 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 assert(PyUnicode_KIND(u) == rkind);
10656 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 ires = i = 0;
10658 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 while (n-- > 0) {
10660 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010663 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010664 if (j == -1)
10665 break;
10666 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 memcpy(res + rkind * ires,
10669 sbuf + rkind * i,
10670 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
10673 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 memcpy(res + rkind * ires,
10685 sbuf + rkind * i,
10686 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 }
10688 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 /* interleave */
10690 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010691 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010693 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 if (--n <= 0)
10696 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010697 memcpy(res + rkind * ires,
10698 sbuf + rkind * i,
10699 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 ires++;
10701 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010703 memcpy(res + rkind * ires,
10704 sbuf + rkind * i,
10705 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010707 }
10708
10709 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010710 unicode_adjust_maxchar(&u);
10711 if (u == NULL)
10712 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010714
10715 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 if (srelease)
10717 PyMem_FREE(sbuf);
10718 if (release1)
10719 PyMem_FREE(buf1);
10720 if (release2)
10721 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010722 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 if (srelease)
10728 PyMem_FREE(sbuf);
10729 if (release1)
10730 PyMem_FREE(buf1);
10731 if (release2)
10732 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010733 return unicode_result_unchanged(self);
10734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 error:
10736 if (srelease && sbuf)
10737 PyMem_FREE(sbuf);
10738 if (release1 && buf1)
10739 PyMem_FREE(buf1);
10740 if (release2 && buf2)
10741 PyMem_FREE(buf2);
10742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743}
10744
10745/* --- Unicode Object Methods --------------------------------------------- */
10746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010747PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749\n\
10750Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
10753static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010754unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010756 if (PyUnicode_READY(self) == -1)
10757 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010758 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759}
10760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010761PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763\n\
10764Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010765have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
10767static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010768unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010770 if (PyUnicode_READY(self) == -1)
10771 return NULL;
10772 if (PyUnicode_GET_LENGTH(self) == 0)
10773 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010774 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775}
10776
Benjamin Petersond5890c82012-01-14 13:23:30 -050010777PyDoc_STRVAR(casefold__doc__,
10778 "S.casefold() -> str\n\
10779\n\
10780Return a version of S suitable for caseless comparisons.");
10781
10782static PyObject *
10783unicode_casefold(PyObject *self)
10784{
10785 if (PyUnicode_READY(self) == -1)
10786 return NULL;
10787 if (PyUnicode_IS_ASCII(self))
10788 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010789 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010790}
10791
10792
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010793/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010794
10795static int
10796convert_uc(PyObject *obj, void *addr)
10797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010799
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010800 if (!PyUnicode_Check(obj)) {
10801 PyErr_Format(PyExc_TypeError,
10802 "The fill character must be a unicode character, "
10803 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 return 0;
10805 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010806 if (PyUnicode_READY(obj) < 0)
10807 return 0;
10808 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010809 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010810 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010811 return 0;
10812 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010813 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010814 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010815}
10816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010817PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010820Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010821done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822
10823static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010824unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010826 Py_ssize_t marg, left;
10827 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 Py_UCS4 fillchar = ' ';
10829
Victor Stinnere9a29352011-10-01 02:14:59 +020010830 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
Benjamin Petersonbac79492012-01-14 13:34:47 -050010833 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 return NULL;
10835
Victor Stinnerc4b49542011-12-11 22:44:26 +010010836 if (PyUnicode_GET_LENGTH(self) >= width)
10837 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
Victor Stinnerc4b49542011-12-11 22:44:26 +010010839 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 left = marg / 2 + (marg & width & 1);
10841
Victor Stinner9310abb2011-10-05 00:59:23 +020010842 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843}
10844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845/* This function assumes that str1 and str2 are readied by the caller. */
10846
Marc-André Lemburge5034372000-08-08 08:04:29 +000010847static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010848unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010849{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850#define COMPARE(TYPE1, TYPE2) \
10851 do { \
10852 TYPE1* p1 = (TYPE1 *)data1; \
10853 TYPE2* p2 = (TYPE2 *)data2; \
10854 TYPE1* end = p1 + len; \
10855 Py_UCS4 c1, c2; \
10856 for (; p1 != end; p1++, p2++) { \
10857 c1 = *p1; \
10858 c2 = *p2; \
10859 if (c1 != c2) \
10860 return (c1 < c2) ? -1 : 1; \
10861 } \
10862 } \
10863 while (0)
10864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 int kind1, kind2;
10866 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010867 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 kind1 = PyUnicode_KIND(str1);
10870 kind2 = PyUnicode_KIND(str2);
10871 data1 = PyUnicode_DATA(str1);
10872 data2 = PyUnicode_DATA(str2);
10873 len1 = PyUnicode_GET_LENGTH(str1);
10874 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010875 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010876
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010877 switch(kind1) {
10878 case PyUnicode_1BYTE_KIND:
10879 {
10880 switch(kind2) {
10881 case PyUnicode_1BYTE_KIND:
10882 {
10883 int cmp = memcmp(data1, data2, len);
10884 /* normalize result of memcmp() into the range [-1; 1] */
10885 if (cmp < 0)
10886 return -1;
10887 if (cmp > 0)
10888 return 1;
10889 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010890 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010891 case PyUnicode_2BYTE_KIND:
10892 COMPARE(Py_UCS1, Py_UCS2);
10893 break;
10894 case PyUnicode_4BYTE_KIND:
10895 COMPARE(Py_UCS1, Py_UCS4);
10896 break;
10897 default:
10898 assert(0);
10899 }
10900 break;
10901 }
10902 case PyUnicode_2BYTE_KIND:
10903 {
10904 switch(kind2) {
10905 case PyUnicode_1BYTE_KIND:
10906 COMPARE(Py_UCS2, Py_UCS1);
10907 break;
10908 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010909 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010910 COMPARE(Py_UCS2, Py_UCS2);
10911 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010912 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 case PyUnicode_4BYTE_KIND:
10914 COMPARE(Py_UCS2, Py_UCS4);
10915 break;
10916 default:
10917 assert(0);
10918 }
10919 break;
10920 }
10921 case PyUnicode_4BYTE_KIND:
10922 {
10923 switch(kind2) {
10924 case PyUnicode_1BYTE_KIND:
10925 COMPARE(Py_UCS4, Py_UCS1);
10926 break;
10927 case PyUnicode_2BYTE_KIND:
10928 COMPARE(Py_UCS4, Py_UCS2);
10929 break;
10930 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010931 {
10932#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10933 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10934 /* normalize result of wmemcmp() into the range [-1; 1] */
10935 if (cmp < 0)
10936 return -1;
10937 if (cmp > 0)
10938 return 1;
10939#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010940 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010941#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010942 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010943 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010944 default:
10945 assert(0);
10946 }
10947 break;
10948 }
10949 default:
10950 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010951 }
10952
Victor Stinner770e19e2012-10-04 22:59:45 +020010953 if (len1 == len2)
10954 return 0;
10955 if (len1 < len2)
10956 return -1;
10957 else
10958 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010959
10960#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961}
10962
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010963Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010964unicode_compare_eq(PyObject *str1, PyObject *str2)
10965{
10966 int kind;
10967 void *data1, *data2;
10968 Py_ssize_t len;
10969 int cmp;
10970
Victor Stinnere5567ad2012-10-23 02:48:49 +020010971 len = PyUnicode_GET_LENGTH(str1);
10972 if (PyUnicode_GET_LENGTH(str2) != len)
10973 return 0;
10974 kind = PyUnicode_KIND(str1);
10975 if (PyUnicode_KIND(str2) != kind)
10976 return 0;
10977 data1 = PyUnicode_DATA(str1);
10978 data2 = PyUnicode_DATA(str2);
10979
10980 cmp = memcmp(data1, data2, len * kind);
10981 return (cmp == 0);
10982}
10983
10984
Alexander Belopolsky40018472011-02-26 01:02:56 +000010985int
10986PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10989 if (PyUnicode_READY(left) == -1 ||
10990 PyUnicode_READY(right) == -1)
10991 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010992
10993 /* a string is equal to itself */
10994 if (left == right)
10995 return 0;
10996
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010997 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010999 PyErr_Format(PyExc_TypeError,
11000 "Can't compare %.100s and %.100s",
11001 left->ob_type->tp_name,
11002 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 return -1;
11004}
11005
Martin v. Löwis5b222132007-06-10 09:51:05 +000011006int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010011007_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
11008{
11009 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
11010 if (right_str == NULL)
11011 return -1;
11012 return PyUnicode_Compare(left, right_str);
11013}
11014
11015int
Martin v. Löwis5b222132007-06-10 09:51:05 +000011016PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 Py_ssize_t i;
11019 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 Py_UCS4 chr;
11021
Victor Stinner910337b2011-10-03 03:20:16 +020011022 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 if (PyUnicode_READY(uni) == -1)
11024 return -1;
11025 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011026 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011027 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011028 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011029 size_t len, len2 = strlen(str);
11030 int cmp;
11031
11032 len = Py_MIN(len1, len2);
11033 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011034 if (cmp != 0) {
11035 if (cmp < 0)
11036 return -1;
11037 else
11038 return 1;
11039 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011040 if (len1 > len2)
11041 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011042 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011043 return -1; /* str is longer */
11044 return 0;
11045 }
11046 else {
11047 void *data = PyUnicode_DATA(uni);
11048 /* Compare Unicode string and source character set string */
11049 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011050 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011051 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11052 /* This check keeps Python strings that end in '\0' from comparing equal
11053 to C strings identical up to that point. */
11054 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11055 return 1; /* uni is longer */
11056 if (str[i])
11057 return -1; /* str is longer */
11058 return 0;
11059 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011060}
11061
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011062
Benjamin Peterson29060642009-01-31 22:14:21 +000011063#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011064 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011065
Alexander Belopolsky40018472011-02-26 01:02:56 +000011066PyObject *
11067PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011068{
11069 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011070 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011071
Victor Stinnere5567ad2012-10-23 02:48:49 +020011072 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11073 Py_RETURN_NOTIMPLEMENTED;
11074
11075 if (PyUnicode_READY(left) == -1 ||
11076 PyUnicode_READY(right) == -1)
11077 return NULL;
11078
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011079 if (left == right) {
11080 switch (op) {
11081 case Py_EQ:
11082 case Py_LE:
11083 case Py_GE:
11084 /* a string is equal to itself */
11085 v = Py_True;
11086 break;
11087 case Py_NE:
11088 case Py_LT:
11089 case Py_GT:
11090 v = Py_False;
11091 break;
11092 default:
11093 PyErr_BadArgument();
11094 return NULL;
11095 }
11096 }
11097 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011098 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011099 result ^= (op == Py_NE);
11100 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011101 }
11102 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011103 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011104
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011105 /* Convert the return value to a Boolean */
11106 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011107 case Py_LE:
11108 v = TEST_COND(result <= 0);
11109 break;
11110 case Py_GE:
11111 v = TEST_COND(result >= 0);
11112 break;
11113 case Py_LT:
11114 v = TEST_COND(result == -1);
11115 break;
11116 case Py_GT:
11117 v = TEST_COND(result == 1);
11118 break;
11119 default:
11120 PyErr_BadArgument();
11121 return NULL;
11122 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011123 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011124 Py_INCREF(v);
11125 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011126}
11127
Alexander Belopolsky40018472011-02-26 01:02:56 +000011128int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011129_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11130{
11131 return unicode_eq(aa, bb);
11132}
11133
11134int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011135PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011136{
Victor Stinner77282cb2013-04-14 19:22:47 +020011137 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 void *buf1, *buf2;
11139 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011140 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011144 "'in <string>' requires string as left operand, not %.100s",
11145 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011147 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011150 if (ensure_unicode(str) < 0)
11151 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011154 kind2 = PyUnicode_KIND(substr);
11155 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011156 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011158 len2 = PyUnicode_GET_LENGTH(substr);
11159 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011160 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011161 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011162 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011163 if (len2 == 1) {
11164 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11165 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 return result;
11167 }
11168 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011169 buf2 = _PyUnicode_AsKind(substr, kind1);
11170 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011171 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173
Victor Stinner77282cb2013-04-14 19:22:47 +020011174 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 case PyUnicode_1BYTE_KIND:
11176 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11177 break;
11178 case PyUnicode_2BYTE_KIND:
11179 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11180 break;
11181 case PyUnicode_4BYTE_KIND:
11182 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11183 break;
11184 default:
11185 result = -1;
11186 assert(0);
11187 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011188
Victor Stinner77282cb2013-04-14 19:22:47 +020011189 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 PyMem_Free(buf2);
11191
Guido van Rossum403d68b2000-03-13 15:55:09 +000011192 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011193}
11194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195/* Concat to string or Unicode object giving a new Unicode object. */
11196
Alexander Belopolsky40018472011-02-26 01:02:56 +000011197PyObject *
11198PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011201 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208 if (left == unicode_empty)
11209 return PyUnicode_FromObject(right);
11210 if (right == unicode_empty)
11211 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213 left_len = PyUnicode_GET_LENGTH(left);
11214 right_len = PyUnicode_GET_LENGTH(right);
11215 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011216 PyErr_SetString(PyExc_OverflowError,
11217 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011218 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011219 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011221
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11223 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011224 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 result = PyUnicode_New(new_len, maxchar);
11228 if (result == NULL)
11229 return NULL;
11230 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11231 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11232 assert(_PyUnicode_CheckConsistency(result, 1));
11233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234}
11235
Walter Dörwald1ab83302007-05-18 17:15:44 +000011236void
Victor Stinner23e56682011-10-03 03:54:37 +020011237PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011238{
Victor Stinner23e56682011-10-03 03:54:37 +020011239 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011240 Py_UCS4 maxchar, maxchar2;
11241 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011242
11243 if (p_left == NULL) {
11244 if (!PyErr_Occurred())
11245 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011246 return;
11247 }
Victor Stinner23e56682011-10-03 03:54:37 +020011248 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011249 if (right == NULL || left == NULL
11250 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011251 if (!PyErr_Occurred())
11252 PyErr_BadInternalCall();
11253 goto error;
11254 }
11255
Benjamin Petersonbac79492012-01-14 13:34:47 -050011256 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011257 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011258 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011259 goto error;
11260
Victor Stinner488fa492011-12-12 00:01:39 +010011261 /* Shortcuts */
11262 if (left == unicode_empty) {
11263 Py_DECREF(left);
11264 Py_INCREF(right);
11265 *p_left = right;
11266 return;
11267 }
11268 if (right == unicode_empty)
11269 return;
11270
11271 left_len = PyUnicode_GET_LENGTH(left);
11272 right_len = PyUnicode_GET_LENGTH(right);
11273 if (left_len > PY_SSIZE_T_MAX - right_len) {
11274 PyErr_SetString(PyExc_OverflowError,
11275 "strings are too large to concat");
11276 goto error;
11277 }
11278 new_len = left_len + right_len;
11279
11280 if (unicode_modifiable(left)
11281 && PyUnicode_CheckExact(right)
11282 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011283 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11284 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011285 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011286 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011287 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11288 {
11289 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011290 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011291 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011292
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011293 /* copy 'right' into the newly allocated area of 'left' */
11294 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011295 }
Victor Stinner488fa492011-12-12 00:01:39 +010011296 else {
11297 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11298 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011299 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011300
Victor Stinner488fa492011-12-12 00:01:39 +010011301 /* Concat the two Unicode strings */
11302 res = PyUnicode_New(new_len, maxchar);
11303 if (res == NULL)
11304 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011305 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11306 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011307 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011308 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011309 }
11310 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011311 return;
11312
11313error:
Victor Stinner488fa492011-12-12 00:01:39 +010011314 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011315}
11316
11317void
11318PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011320 PyUnicode_Append(pleft, right);
11321 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011322}
11323
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011324/*
11325Wraps stringlib_parse_args_finds() and additionally ensures that the
11326first argument is a unicode object.
11327*/
11328
11329Py_LOCAL_INLINE(int)
11330parse_args_finds_unicode(const char * function_name, PyObject *args,
11331 PyObject **substring,
11332 Py_ssize_t *start, Py_ssize_t *end)
11333{
11334 if(stringlib_parse_args_finds(function_name, args, substring,
11335 start, end)) {
11336 if (ensure_unicode(*substring) < 0)
11337 return 0;
11338 return 1;
11339 }
11340 return 0;
11341}
11342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011346Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011347string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
11350static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011351unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011353 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011354 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011355 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011357 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 void *buf1, *buf2;
11359 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011361 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 kind1 = PyUnicode_KIND(self);
11365 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011366 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011367 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 len1 = PyUnicode_GET_LENGTH(self);
11370 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011372 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011373 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011375 buf1 = PyUnicode_DATA(self);
11376 buf2 = PyUnicode_DATA(substring);
11377 if (kind2 != kind1) {
11378 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011379 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011380 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011381 }
11382 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 case PyUnicode_1BYTE_KIND:
11384 iresult = ucs1lib_count(
11385 ((Py_UCS1*)buf1) + start, end - start,
11386 buf2, len2, PY_SSIZE_T_MAX
11387 );
11388 break;
11389 case PyUnicode_2BYTE_KIND:
11390 iresult = ucs2lib_count(
11391 ((Py_UCS2*)buf1) + start, end - start,
11392 buf2, len2, PY_SSIZE_T_MAX
11393 );
11394 break;
11395 case PyUnicode_4BYTE_KIND:
11396 iresult = ucs4lib_count(
11397 ((Py_UCS4*)buf1) + start, end - start,
11398 buf2, len2, PY_SSIZE_T_MAX
11399 );
11400 break;
11401 default:
11402 assert(0); iresult = 0;
11403 }
11404
11405 result = PyLong_FromSsize_t(iresult);
11406
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011407 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 return result;
11411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011414 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011416Encode S using the codec registered for encoding. Default encoding\n\
11417is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011418handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011419a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11420'xmlcharrefreplace' as well as any other name registered with\n\
11421codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
11423static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011424unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011426 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427 char *encoding = NULL;
11428 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011429
Benjamin Peterson308d6372009-09-18 21:42:35 +000011430 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11431 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011433 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011434}
11435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011436PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011437 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438\n\
11439Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011440If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011443unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 Py_ssize_t i, j, line_pos, src_len, incr;
11446 Py_UCS4 ch;
11447 PyObject *u;
11448 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011449 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011452 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Ezio Melotti745d54d2013-11-16 19:10:57 +020011454 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11455 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Antoine Pitrou22425222011-10-04 19:10:51 +020011458 if (PyUnicode_READY(self) == -1)
11459 return NULL;
11460
Thomas Wouters7e474022000-07-16 12:04:32 +000011461 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 src_len = PyUnicode_GET_LENGTH(self);
11463 i = j = line_pos = 0;
11464 kind = PyUnicode_KIND(self);
11465 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011466 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 for (; i < src_len; i++) {
11468 ch = PyUnicode_READ(kind, src_data, i);
11469 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011470 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 goto overflow;
11475 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011477 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 goto overflow;
11482 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 if (ch == '\n' || ch == '\r')
11485 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011487 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011488 if (!found)
11489 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011492 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 if (!u)
11494 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011495 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Antoine Pitroue71d5742011-10-04 15:55:09 +020011497 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 for (; i < src_len; i++) {
11500 ch = PyUnicode_READ(kind, src_data, i);
11501 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011503 incr = tabsize - (line_pos % tabsize);
11504 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011505 FILL(kind, dest_data, ' ', j, incr);
11506 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 line_pos++;
11511 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011512 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 if (ch == '\n' || ch == '\r')
11514 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011516 }
11517 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011518 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011519
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011521 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
11528Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011529such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530arguments start and end are interpreted as in slice notation.\n\
11531\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011537 /* initialize variables to prevent gcc warning */
11538 PyObject *substring = NULL;
11539 Py_ssize_t start = 0;
11540 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011541 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011543 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011546 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011549 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (result == -2)
11552 return NULL;
11553
Christian Heimes217cfd12007-12-02 14:31:20 +000011554 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555}
11556
11557static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011558unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011560 void *data;
11561 enum PyUnicode_Kind kind;
11562 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011563
11564 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11565 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011567 }
11568 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11569 PyErr_SetString(PyExc_IndexError, "string index out of range");
11570 return NULL;
11571 }
11572 kind = PyUnicode_KIND(self);
11573 data = PyUnicode_DATA(self);
11574 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011575 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576}
11577
Guido van Rossumc2504932007-09-18 19:42:40 +000011578/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011579 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011580static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011581unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582{
Guido van Rossumc2504932007-09-18 19:42:40 +000011583 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011584 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011585
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011586#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011587 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011588#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (_PyUnicode_HASH(self) != -1)
11590 return _PyUnicode_HASH(self);
11591 if (PyUnicode_READY(self) == -1)
11592 return -1;
11593 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011594 /*
11595 We make the hash of the empty string be 0, rather than using
11596 (prefix ^ suffix), since this slightly obfuscates the hash secret
11597 */
11598 if (len == 0) {
11599 _PyUnicode_HASH(self) = 0;
11600 return 0;
11601 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011602 x = _Py_HashBytes(PyUnicode_DATA(self),
11603 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011605 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011608PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011609 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
11613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011616 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011617 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011618 PyObject *substring = NULL;
11619 Py_ssize_t start = 0;
11620 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011622 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011625 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (result == -2)
11631 return NULL;
11632
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 if (result < 0) {
11634 PyErr_SetString(PyExc_ValueError, "substring not found");
11635 return NULL;
11636 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011637
Christian Heimes217cfd12007-12-02 14:31:20 +000011638 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639}
11640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011641PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011644Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
11647static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011648unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 Py_ssize_t i, length;
11651 int kind;
11652 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 int cased;
11654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (PyUnicode_READY(self) == -1)
11656 return NULL;
11657 length = PyUnicode_GET_LENGTH(self);
11658 kind = PyUnicode_KIND(self);
11659 data = PyUnicode_DATA(self);
11660
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (length == 1)
11663 return PyBool_FromLong(
11664 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011666 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011669
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 for (i = 0; i < length; i++) {
11672 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011673
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11675 return PyBool_FromLong(0);
11676 else if (!cased && Py_UNICODE_ISLOWER(ch))
11677 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011679 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680}
11681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011682PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011685Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
11688static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011689unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 Py_ssize_t i, length;
11692 int kind;
11693 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694 int cased;
11695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (PyUnicode_READY(self) == -1)
11697 return NULL;
11698 length = PyUnicode_GET_LENGTH(self);
11699 kind = PyUnicode_KIND(self);
11700 data = PyUnicode_DATA(self);
11701
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (length == 1)
11704 return PyBool_FromLong(
11705 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011707 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011710
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 for (i = 0; i < length; i++) {
11713 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011714
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11716 return PyBool_FromLong(0);
11717 else if (!cased && Py_UNICODE_ISUPPER(ch))
11718 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011720 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721}
11722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011726Return True if S is a titlecased string and there is at least one\n\
11727character in S, i.e. upper- and titlecase characters may only\n\
11728follow uncased characters and lowercase characters only cased ones.\n\
11729Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
11731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 int cased, previous_is_cased;
11738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (PyUnicode_READY(self) == -1)
11740 return NULL;
11741 length = PyUnicode_GET_LENGTH(self);
11742 kind = PyUnicode_KIND(self);
11743 data = PyUnicode_DATA(self);
11744
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 1) {
11747 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11748 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11749 (Py_UNICODE_ISUPPER(ch) != 0));
11750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011755
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 cased = 0;
11757 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 for (i = 0; i < length; i++) {
11759 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011760
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11762 if (previous_is_cased)
11763 return PyBool_FromLong(0);
11764 previous_is_cased = 1;
11765 cased = 1;
11766 }
11767 else if (Py_UNICODE_ISLOWER(ch)) {
11768 if (!previous_is_cased)
11769 return PyBool_FromLong(0);
11770 previous_is_cased = 1;
11771 cased = 1;
11772 }
11773 else
11774 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011776 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777}
11778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011779PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011782Return True if all characters in S are whitespace\n\
11783and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
11785static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 Py_ssize_t i, length;
11789 int kind;
11790 void *data;
11791
11792 if (PyUnicode_READY(self) == -1)
11793 return NULL;
11794 length = PyUnicode_GET_LENGTH(self);
11795 kind = PyUnicode_KIND(self);
11796 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (length == 1)
11800 return PyBool_FromLong(
11801 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011803 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 for (i = 0; i < length; i++) {
11808 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011809 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011812 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813}
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011818Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011819and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820
11821static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011822unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 Py_ssize_t i, length;
11825 int kind;
11826 void *data;
11827
11828 if (PyUnicode_READY(self) == -1)
11829 return NULL;
11830 length = PyUnicode_GET_LENGTH(self);
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011833
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (length == 1)
11836 return PyBool_FromLong(
11837 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011838
11839 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 for (i = 0; i < length; i++) {
11844 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011846 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011847 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011848}
11849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011850PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011852\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011853Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011854and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011855
11856static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011857unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 int kind;
11860 void *data;
11861 Py_ssize_t len, i;
11862
11863 if (PyUnicode_READY(self) == -1)
11864 return NULL;
11865
11866 kind = PyUnicode_KIND(self);
11867 data = PyUnicode_DATA(self);
11868 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (len == 1) {
11872 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11873 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11874 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011875
11876 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 for (i = 0; i < len; i++) {
11881 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011882 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011886}
11887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011891Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
11894static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 Py_ssize_t i, length;
11898 int kind;
11899 void *data;
11900
11901 if (PyUnicode_READY(self) == -1)
11902 return NULL;
11903 length = PyUnicode_GET_LENGTH(self);
11904 kind = PyUnicode_KIND(self);
11905 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (length == 1)
11909 return PyBool_FromLong(
11910 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011912 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 for (i = 0; i < length; i++) {
11917 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011920 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011923PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011926Return True if all characters in S are digits\n\
11927and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
11929static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011930unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 Py_ssize_t i, length;
11933 int kind;
11934 void *data;
11935
11936 if (PyUnicode_READY(self) == -1)
11937 return NULL;
11938 length = PyUnicode_GET_LENGTH(self);
11939 kind = PyUnicode_KIND(self);
11940 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (length == 1) {
11944 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11945 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011948 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 for (i = 0; i < length; i++) {
11953 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957}
11958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011959PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011962Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011963False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
11965static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 Py_ssize_t i, length;
11969 int kind;
11970 void *data;
11971
11972 if (PyUnicode_READY(self) == -1)
11973 return NULL;
11974 length = PyUnicode_GET_LENGTH(self);
11975 kind = PyUnicode_KIND(self);
11976 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (length == 1)
11980 return PyBool_FromLong(
11981 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011983 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 for (i = 0; i < length; i++) {
11988 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011991 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
Martin v. Löwis47383402007-08-15 07:32:56 +000011994int
11995PyUnicode_IsIdentifier(PyObject *self)
11996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 int kind;
11998 void *data;
11999 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012000 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READY(self) == -1) {
12003 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 }
12006
12007 /* Special case for empty strings */
12008 if (PyUnicode_GET_LENGTH(self) == 0)
12009 return 0;
12010 kind = PyUnicode_KIND(self);
12011 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012012
12013 /* PEP 3131 says that the first character must be in
12014 XID_Start and subsequent characters in XID_Continue,
12015 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012016 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012017 letters, digits, underscore). However, given the current
12018 definition of XID_Start and XID_Continue, it is sufficient
12019 to check just for these, except that _ must be allowed
12020 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012022 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012023 return 0;
12024
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012025 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012028 return 1;
12029}
12030
12031PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012033\n\
12034Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012035to the language definition.\n\
12036\n\
12037Use keyword.iskeyword() to test for reserved identifiers\n\
12038such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012039
12040static PyObject*
12041unicode_isidentifier(PyObject *self)
12042{
12043 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12044}
12045
Georg Brandl559e5d72008-06-11 18:37:52 +000012046PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012048\n\
12049Return True if all characters in S are considered\n\
12050printable in repr() or S is empty, False otherwise.");
12051
12052static PyObject*
12053unicode_isprintable(PyObject *self)
12054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_ssize_t i, length;
12056 int kind;
12057 void *data;
12058
12059 if (PyUnicode_READY(self) == -1)
12060 return NULL;
12061 length = PyUnicode_GET_LENGTH(self);
12062 kind = PyUnicode_KIND(self);
12063 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012064
12065 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if (length == 1)
12067 return PyBool_FromLong(
12068 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 for (i = 0; i < length; i++) {
12071 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012072 Py_RETURN_FALSE;
12073 }
12074 }
12075 Py_RETURN_TRUE;
12076}
12077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012078PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012079 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080\n\
12081Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012082iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
12084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012085unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012087 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088}
12089
Martin v. Löwis18e16552006-02-15 17:27:45 +000012090static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012091unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (PyUnicode_READY(self) == -1)
12094 return -1;
12095 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096}
12097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012098PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012101Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012102done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103
12104static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012105unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012107 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 Py_UCS4 fillchar = ' ';
12109
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012110 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 return NULL;
12112
Benjamin Petersonbac79492012-01-14 13:34:47 -050012113 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012114 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
Victor Stinnerc4b49542011-12-11 22:44:26 +010012116 if (PyUnicode_GET_LENGTH(self) >= width)
12117 return unicode_result_unchanged(self);
12118
12119 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120}
12121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012122PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012123 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012125Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
12127static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012128unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012130 if (PyUnicode_READY(self) == -1)
12131 return NULL;
12132 if (PyUnicode_IS_ASCII(self))
12133 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012134 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135}
12136
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format string for each strip mode, indexed by the constants
   above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
12145
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146/* externally visible for str.strip(unicode) */
12147PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 void *data;
12151 int kind;
12152 Py_ssize_t i, j, len;
12153 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012154 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12157 return NULL;
12158
12159 kind = PyUnicode_KIND(self);
12160 data = PyUnicode_DATA(self);
12161 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012162 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12164 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012165 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012166
Benjamin Peterson14339b62009-01-31 16:36:08 +000012167 i = 0;
12168 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012169 while (i < len) {
12170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12171 if (!BLOOM(sepmask, ch))
12172 break;
12173 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12174 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 i++;
12176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012177 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012178
Benjamin Peterson14339b62009-01-31 16:36:08 +000012179 j = len;
12180 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012181 j--;
12182 while (j >= i) {
12183 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12184 if (!BLOOM(sepmask, ch))
12185 break;
12186 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12187 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012189 }
12190
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012192 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012193
Victor Stinner7931d9a2011-11-04 00:22:48 +010012194 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195}
12196
12197PyObject*
12198PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12199{
12200 unsigned char *data;
12201 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012202 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203
Victor Stinnerde636f32011-10-01 03:55:54 +020012204 if (PyUnicode_READY(self) == -1)
12205 return NULL;
12206
Victor Stinner684d5fd2012-05-03 02:32:34 +020012207 length = PyUnicode_GET_LENGTH(self);
12208 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012209
Victor Stinner684d5fd2012-05-03 02:32:34 +020012210 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012211 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212
Victor Stinnerde636f32011-10-01 03:55:54 +020012213 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012214 PyErr_SetString(PyExc_IndexError, "string index out of range");
12215 return NULL;
12216 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012217 if (start >= length || end < start)
12218 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012219
Victor Stinner684d5fd2012-05-03 02:32:34 +020012220 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012221 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012222 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012223 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012224 }
12225 else {
12226 kind = PyUnicode_KIND(self);
12227 data = PyUnicode_1BYTE_DATA(self);
12228 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012229 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012230 length);
12231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
12234static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012235do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 Py_ssize_t len, i, j;
12238
12239 if (PyUnicode_READY(self) == -1)
12240 return NULL;
12241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012243
Victor Stinnercc7af722013-04-09 22:39:24 +020012244 if (PyUnicode_IS_ASCII(self)) {
12245 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12246
12247 i = 0;
12248 if (striptype != RIGHTSTRIP) {
12249 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012250 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012251 if (!_Py_ascii_whitespace[ch])
12252 break;
12253 i++;
12254 }
12255 }
12256
12257 j = len;
12258 if (striptype != LEFTSTRIP) {
12259 j--;
12260 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012261 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012262 if (!_Py_ascii_whitespace[ch])
12263 break;
12264 j--;
12265 }
12266 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012267 }
12268 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012269 else {
12270 int kind = PyUnicode_KIND(self);
12271 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012272
Victor Stinnercc7af722013-04-09 22:39:24 +020012273 i = 0;
12274 if (striptype != RIGHTSTRIP) {
12275 while (i < len) {
12276 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12277 if (!Py_UNICODE_ISSPACE(ch))
12278 break;
12279 i++;
12280 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012281 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012282
12283 j = len;
12284 if (striptype != LEFTSTRIP) {
12285 j--;
12286 while (j >= i) {
12287 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12288 if (!Py_UNICODE_ISSPACE(ch))
12289 break;
12290 j--;
12291 }
12292 j++;
12293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012294 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295
Victor Stinner7931d9a2011-11-04 00:22:48 +010012296 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297}
12298
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299
12300static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012301do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304
Serhiy Storchakac6792272013-10-19 21:03:34 +030012305 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
Benjamin Peterson14339b62009-01-31 16:36:08 +000012308 if (sep != NULL && sep != Py_None) {
12309 if (PyUnicode_Check(sep))
12310 return _PyUnicode_XStrip(self, striptype, sep);
12311 else {
12312 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 "%s arg must be None or str",
12314 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315 return NULL;
12316 }
12317 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012320}
12321
12322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012323PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012325\n\
12326Return a copy of the string S with leading and trailing\n\
12327whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012328If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012329
12330static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012331unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 if (PyTuple_GET_SIZE(args) == 0)
12334 return do_strip(self, BOTHSTRIP); /* Common case */
12335 else
12336 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012337}
12338
12339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012340PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342\n\
12343Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012344If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012345
12346static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012347unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012348{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012349 if (PyTuple_GET_SIZE(args) == 0)
12350 return do_strip(self, LEFTSTRIP); /* Common case */
12351 else
12352 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012353}
12354
12355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012356PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012358\n\
12359Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012360If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012361
12362static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012363unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012365 if (PyTuple_GET_SIZE(args) == 0)
12366 return do_strip(self, RIGHTSTRIP); /* Common case */
12367 else
12368 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012369}
12370
12371
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012373unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012375 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
Serhiy Storchaka05997252013-01-26 12:14:02 +020012378 if (len < 1)
12379 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
Victor Stinnerc4b49542011-12-11 22:44:26 +010012381 /* no repeat, return original string */
12382 if (len == 1)
12383 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012384
Benjamin Petersonbac79492012-01-14 13:34:47 -050012385 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 return NULL;
12387
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012388 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012389 PyErr_SetString(PyExc_OverflowError,
12390 "repeated string is too long");
12391 return NULL;
12392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012394
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012395 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396 if (!u)
12397 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012398 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 if (PyUnicode_GET_LENGTH(str) == 1) {
12401 const int kind = PyUnicode_KIND(str);
12402 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012403 if (kind == PyUnicode_1BYTE_KIND) {
12404 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012405 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012406 }
12407 else if (kind == PyUnicode_2BYTE_KIND) {
12408 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012409 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012410 ucs2[n] = fill_char;
12411 } else {
12412 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12413 assert(kind == PyUnicode_4BYTE_KIND);
12414 for (n = 0; n < len; ++n)
12415 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 }
12418 else {
12419 /* number of characters copied this far */
12420 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012421 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 char *to = (char *) PyUnicode_DATA(u);
12423 Py_MEMCPY(to, PyUnicode_DATA(str),
12424 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 n = (done <= nchars-done) ? done : nchars-done;
12427 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430 }
12431
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012432 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012433 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434}
12435
Alexander Belopolsky40018472011-02-26 01:02:56 +000012436PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012437PyUnicode_Replace(PyObject *str,
12438 PyObject *substr,
12439 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012440 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012442 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12443 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012445 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446}
12447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012448PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012449 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450\n\
12451Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012452old replaced by new. If the optional argument count is\n\
12453given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
12455static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 PyObject *str1;
12459 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012460 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012462 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012464 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012466 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467}
12468
Alexander Belopolsky40018472011-02-26 01:02:56 +000012469static PyObject *
12470unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012472 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 Py_ssize_t isize;
12474 Py_ssize_t osize, squote, dquote, i, o;
12475 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012476 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012480 return NULL;
12481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 isize = PyUnicode_GET_LENGTH(unicode);
12483 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 /* Compute length of output, quote characters, and
12486 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012487 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 max = 127;
12489 squote = dquote = 0;
12490 ikind = PyUnicode_KIND(unicode);
12491 for (i = 0; i < isize; i++) {
12492 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012493 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012495 case '\'': squote++; break;
12496 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012498 incr = 2;
12499 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 default:
12501 /* Fast-path ASCII */
12502 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012503 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012505 ;
12506 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012509 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012511 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012513 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012515 if (osize > PY_SSIZE_T_MAX - incr) {
12516 PyErr_SetString(PyExc_OverflowError,
12517 "string is too long to generate repr");
12518 return NULL;
12519 }
12520 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 }
12522
12523 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012524 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012526 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 if (dquote)
12528 /* Both squote and dquote present. Use squote,
12529 and escape them */
12530 osize += squote;
12531 else
12532 quote = '"';
12533 }
Victor Stinner55c08782013-04-14 18:45:39 +020012534 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535
12536 repr = PyUnicode_New(osize, max);
12537 if (repr == NULL)
12538 return NULL;
12539 okind = PyUnicode_KIND(repr);
12540 odata = PyUnicode_DATA(repr);
12541
12542 PyUnicode_WRITE(okind, odata, 0, quote);
12543 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012544 if (unchanged) {
12545 _PyUnicode_FastCopyCharacters(repr, 1,
12546 unicode, 0,
12547 isize);
12548 }
12549 else {
12550 for (i = 0, o = 1; i < isize; i++) {
12551 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552
Victor Stinner55c08782013-04-14 18:45:39 +020012553 /* Escape quotes and backslashes */
12554 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012555 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012557 continue;
12558 }
12559
12560 /* Map special whitespace to '\t', \n', '\r' */
12561 if (ch == '\t') {
12562 PyUnicode_WRITE(okind, odata, o++, '\\');
12563 PyUnicode_WRITE(okind, odata, o++, 't');
12564 }
12565 else if (ch == '\n') {
12566 PyUnicode_WRITE(okind, odata, o++, '\\');
12567 PyUnicode_WRITE(okind, odata, o++, 'n');
12568 }
12569 else if (ch == '\r') {
12570 PyUnicode_WRITE(okind, odata, o++, '\\');
12571 PyUnicode_WRITE(okind, odata, o++, 'r');
12572 }
12573
12574 /* Map non-printable US ASCII to '\xhh' */
12575 else if (ch < ' ' || ch == 0x7F) {
12576 PyUnicode_WRITE(okind, odata, o++, '\\');
12577 PyUnicode_WRITE(okind, odata, o++, 'x');
12578 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12580 }
12581
12582 /* Copy ASCII characters as-is */
12583 else if (ch < 0x7F) {
12584 PyUnicode_WRITE(okind, odata, o++, ch);
12585 }
12586
12587 /* Non-ASCII characters */
12588 else {
12589 /* Map Unicode whitespace and control characters
12590 (categories Z* and C* except ASCII space)
12591 */
12592 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12593 PyUnicode_WRITE(okind, odata, o++, '\\');
12594 /* Map 8-bit characters to '\xhh' */
12595 if (ch <= 0xff) {
12596 PyUnicode_WRITE(okind, odata, o++, 'x');
12597 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12598 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12599 }
12600 /* Map 16-bit characters to '\uxxxx' */
12601 else if (ch <= 0xffff) {
12602 PyUnicode_WRITE(okind, odata, o++, 'u');
12603 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12604 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12605 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12606 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12607 }
12608 /* Map 21-bit characters to '\U00xxxxxx' */
12609 else {
12610 PyUnicode_WRITE(okind, odata, o++, 'U');
12611 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12612 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12613 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12614 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12615 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12616 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12617 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12618 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12619 }
12620 }
12621 /* Copy characters as-is */
12622 else {
12623 PyUnicode_WRITE(okind, odata, o++, ch);
12624 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012625 }
12626 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012629 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012630 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012633PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635\n\
12636Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012637such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638arguments start and end are interpreted as in slice notation.\n\
12639\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012640Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
12642static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012645 /* initialize variables to prevent gcc warning */
12646 PyObject *substring = NULL;
12647 Py_ssize_t start = 0;
12648 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012651 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012654 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012657 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 if (result == -2)
12660 return NULL;
12661
Christian Heimes217cfd12007-12-02 14:31:20 +000012662 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012665PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
12670static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012673 /* initialize variables to prevent gcc warning */
12674 PyObject *substring = NULL;
12675 Py_ssize_t start = 0;
12676 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012677 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012679 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012682 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012685 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 if (result == -2)
12688 return NULL;
12689
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 if (result < 0) {
12691 PyErr_SetString(PyExc_ValueError, "substring not found");
12692 return NULL;
12693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694
Christian Heimes217cfd12007-12-02 14:31:20 +000012695 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696}
12697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012698PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012701Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012702done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
12704static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012705unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012707 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 Py_UCS4 fillchar = ' ';
12709
Victor Stinnere9a29352011-10-01 02:14:59 +020012710 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012712
Benjamin Petersonbac79492012-01-14 13:34:47 -050012713 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 return NULL;
12715
Victor Stinnerc4b49542011-12-11 22:44:26 +010012716 if (PyUnicode_GET_LENGTH(self) >= width)
12717 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
Victor Stinnerc4b49542011-12-11 22:44:26 +010012719 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
12721
Alexander Belopolsky40018472011-02-26 01:02:56 +000012722PyObject *
12723PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012725 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012728 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729}
12730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012731PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012732 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733\n\
12734Return a list of the words in S, using sep as the\n\
12735delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012736splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012737whitespace string is a separator and empty strings are\n\
12738removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739
12740static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012741unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012743 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012745 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012747 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12748 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749 return NULL;
12750
12751 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012753
12754 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012755 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012756
12757 PyErr_Format(PyExc_TypeError,
12758 "must be str or None, not %.100s",
12759 Py_TYPE(substring)->tp_name);
12760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761}
12762
Thomas Wouters477c8d52006-05-27 19:21:47 +000012763PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012764PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012766 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012767 int kind1, kind2;
12768 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012771 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012773
Victor Stinner14f8f022011-10-05 20:58:25 +020012774 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 len1 = PyUnicode_GET_LENGTH(str_obj);
12777 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012778 if (kind1 < kind2 || len1 < len2) {
12779 _Py_INCREF_UNICODE_EMPTY();
12780 if (!unicode_empty)
12781 out = NULL;
12782 else {
12783 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12784 Py_DECREF(unicode_empty);
12785 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012786 return out;
12787 }
12788 buf1 = PyUnicode_DATA(str_obj);
12789 buf2 = PyUnicode_DATA(sep_obj);
12790 if (kind2 != kind1) {
12791 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12792 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012793 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012796 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012798 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12799 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12800 else
12801 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 break;
12803 case PyUnicode_2BYTE_KIND:
12804 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12805 break;
12806 case PyUnicode_4BYTE_KIND:
12807 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12808 break;
12809 default:
12810 assert(0);
12811 out = 0;
12812 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012814 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816
12817 return out;
12818}
12819
12820
12821PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012825 int kind1, kind2;
12826 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012829 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012831
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012832 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 len1 = PyUnicode_GET_LENGTH(str_obj);
12835 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012836 if (kind1 < kind2 || len1 < len2) {
12837 _Py_INCREF_UNICODE_EMPTY();
12838 if (!unicode_empty)
12839 out = NULL;
12840 else {
12841 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12842 Py_DECREF(unicode_empty);
12843 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012844 return out;
12845 }
12846 buf1 = PyUnicode_DATA(str_obj);
12847 buf2 = PyUnicode_DATA(sep_obj);
12848 if (kind2 != kind1) {
12849 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12850 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012851 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012854 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012856 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12857 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12858 else
12859 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 break;
12861 case PyUnicode_2BYTE_KIND:
12862 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12863 break;
12864 case PyUnicode_4BYTE_KIND:
12865 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12866 break;
12867 default:
12868 assert(0);
12869 out = 0;
12870 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012872 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012874
12875 return out;
12876}
12877
12878PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012880\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012881Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012882the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012883found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012884
12885static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012886unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887{
Victor Stinner9310abb2011-10-05 00:59:23 +020012888 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012889}
12890
12891PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012892 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012893\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012894Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012895the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012896separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012897
12898static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012899unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900{
Victor Stinner9310abb2011-10-05 00:59:23 +020012901 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902}
12903
Alexander Belopolsky40018472011-02-26 01:02:56 +000012904PyObject *
12905PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012906{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012907 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012908 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012909
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012910 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012911}
12912
12913PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012914 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012915\n\
12916Return a list of the words in S, using sep as the\n\
12917delimiter string, starting at the end of the string and\n\
12918working to the front. If maxsplit is given, at most maxsplit\n\
12919splits are done. If sep is not specified, any whitespace string\n\
12920is a separator.");
12921
12922static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012923unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012924{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012925 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012926 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012927 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012928
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012929 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12930 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012931 return NULL;
12932
12933 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012935
12936 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012937 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012938
12939 PyErr_Format(PyExc_TypeError,
12940 "must be str or None, not %.100s",
12941 Py_TYPE(substring)->tp_name);
12942 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012943}
12944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012945PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947\n\
12948Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012949Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012950is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
12952static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012953unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012955 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012956 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012958 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12959 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960 return NULL;
12961
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012962 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
12965static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012966PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012968 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969}
12970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012971PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012972 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973\n\
12974Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012975and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976
12977static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012978unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012980 if (PyUnicode_READY(self) == -1)
12981 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012982 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983}
12984
Larry Hastings61272b72014-01-07 12:41:53 -080012985/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012986
Larry Hastings31826802013-10-19 00:09:25 -070012987@staticmethod
12988str.maketrans as unicode_maketrans
12989
12990 x: object
12991
12992 y: unicode=NULL
12993
12994 z: unicode=NULL
12995
12996 /
12997
12998Return a translation table usable for str.translate().
12999
13000If there is only one argument, it must be a dictionary mapping Unicode
13001ordinals (integers) or characters to Unicode ordinals, strings or None.
13002Character keys will be then converted to ordinals.
13003If there are two arguments, they must be strings of equal length, and
13004in the resulting dictionary, each character in x will be mapped to the
13005character at the same position in y. If there is a third argument, it
13006must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013007[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013008
Larry Hastings31826802013-10-19 00:09:25 -070013009static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013010unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013011/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013012{
Georg Brandlceee0772007-11-27 23:48:05 +000013013 PyObject *new = NULL, *key, *value;
13014 Py_ssize_t i = 0;
13015 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016
Georg Brandlceee0772007-11-27 23:48:05 +000013017 new = PyDict_New();
13018 if (!new)
13019 return NULL;
13020 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 int x_kind, y_kind, z_kind;
13022 void *x_data, *y_data, *z_data;
13023
Georg Brandlceee0772007-11-27 23:48:05 +000013024 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013025 if (!PyUnicode_Check(x)) {
13026 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13027 "be a string if there is a second argument");
13028 goto err;
13029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013031 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13032 "arguments must have equal length");
13033 goto err;
13034 }
13035 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 x_kind = PyUnicode_KIND(x);
13037 y_kind = PyUnicode_KIND(y);
13038 x_data = PyUnicode_DATA(x);
13039 y_data = PyUnicode_DATA(y);
13040 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13041 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013042 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013043 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013044 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013045 if (!value) {
13046 Py_DECREF(key);
13047 goto err;
13048 }
Georg Brandlceee0772007-11-27 23:48:05 +000013049 res = PyDict_SetItem(new, key, value);
13050 Py_DECREF(key);
13051 Py_DECREF(value);
13052 if (res < 0)
13053 goto err;
13054 }
13055 /* create entries for deleting chars in z */
13056 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 z_kind = PyUnicode_KIND(z);
13058 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013059 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013061 if (!key)
13062 goto err;
13063 res = PyDict_SetItem(new, key, Py_None);
13064 Py_DECREF(key);
13065 if (res < 0)
13066 goto err;
13067 }
13068 }
13069 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 int kind;
13071 void *data;
13072
Georg Brandlceee0772007-11-27 23:48:05 +000013073 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013074 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013075 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13076 "to maketrans it must be a dict");
13077 goto err;
13078 }
13079 /* copy entries into the new dict, converting string keys to int keys */
13080 while (PyDict_Next(x, &i, &key, &value)) {
13081 if (PyUnicode_Check(key)) {
13082 /* convert string keys to integer keys */
13083 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013084 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013085 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13086 "table must be of length 1");
13087 goto err;
13088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089 kind = PyUnicode_KIND(key);
13090 data = PyUnicode_DATA(key);
13091 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013092 if (!newkey)
13093 goto err;
13094 res = PyDict_SetItem(new, newkey, value);
13095 Py_DECREF(newkey);
13096 if (res < 0)
13097 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013098 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013099 /* just keep integer keys */
13100 if (PyDict_SetItem(new, key, value) < 0)
13101 goto err;
13102 } else {
13103 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13104 "be strings or integers");
13105 goto err;
13106 }
13107 }
13108 }
13109 return new;
13110 err:
13111 Py_DECREF(new);
13112 return NULL;
13113}
13114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013115PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013118Return a copy of the string S in which each character has been mapped\n\
13119through the given translation table. The table must implement\n\
13120lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13121mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13122this operation raises LookupError, the character is left untouched.\n\
13123Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
13125static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129}
13130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013131PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013134Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
13136static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013137unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013139 if (PyUnicode_READY(self) == -1)
13140 return NULL;
13141 if (PyUnicode_IS_ASCII(self))
13142 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013143 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144}
13145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013146PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013149Pad a numeric string S with zeros on the left, to fill a field\n\
13150of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151
13152static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013153unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013155 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013156 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013157 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 int kind;
13159 void *data;
13160 Py_UCS4 chr;
13161
Martin v. Löwis18e16552006-02-15 17:27:45 +000013162 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163 return NULL;
13164
Benjamin Petersonbac79492012-01-14 13:34:47 -050013165 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167
Victor Stinnerc4b49542011-12-11 22:44:26 +010013168 if (PyUnicode_GET_LENGTH(self) >= width)
13169 return unicode_result_unchanged(self);
13170
13171 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172
13173 u = pad(self, fill, 0, '0');
13174
Walter Dörwald068325e2002-04-15 13:36:47 +000013175 if (u == NULL)
13176 return NULL;
13177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 kind = PyUnicode_KIND(u);
13179 data = PyUnicode_DATA(u);
13180 chr = PyUnicode_READ(kind, data, fill);
13181
13182 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 PyUnicode_WRITE(kind, data, 0, chr);
13185 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186 }
13187
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013188 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013189 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191
#if 0
/* Compiled out: thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013200PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013203Return True if S starts with the specified prefix, False otherwise.\n\
13204With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013205With optional end, stop comparing S at that position.\n\
13206prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207
13208static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013209unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013214 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013215 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013216 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
Jesus Ceaac451502011-04-20 17:09:23 +020013218 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013219 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013220 if (PyTuple_Check(subobj)) {
13221 Py_ssize_t i;
13222 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223 substring = PyTuple_GET_ITEM(subobj, i);
13224 if (!PyUnicode_Check(substring)) {
13225 PyErr_Format(PyExc_TypeError,
13226 "tuple for startswith must only contain str, "
13227 "not %.100s",
13228 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013230 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013231 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013232 if (result == -1)
13233 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013234 if (result) {
13235 Py_RETURN_TRUE;
13236 }
13237 }
13238 /* nothing matched */
13239 Py_RETURN_FALSE;
13240 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241 if (!PyUnicode_Check(subobj)) {
13242 PyErr_Format(PyExc_TypeError,
13243 "startswith first arg must be str or "
13244 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013245 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013246 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013247 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013248 if (result == -1)
13249 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013250 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251}
13252
13253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013254PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013257Return True if S ends with the specified suffix, False otherwise.\n\
13258With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013259With optional end, stop comparing S at that position.\n\
13260suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261
13262static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013263unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013266 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013267 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013268 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013269 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013270 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271
Jesus Ceaac451502011-04-20 17:09:23 +020013272 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013274 if (PyTuple_Check(subobj)) {
13275 Py_ssize_t i;
13276 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013277 substring = PyTuple_GET_ITEM(subobj, i);
13278 if (!PyUnicode_Check(substring)) {
13279 PyErr_Format(PyExc_TypeError,
13280 "tuple for endswith must only contain str, "
13281 "not %.100s",
13282 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013284 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013285 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013286 if (result == -1)
13287 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013288 if (result) {
13289 Py_RETURN_TRUE;
13290 }
13291 }
13292 Py_RETURN_FALSE;
13293 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013294 if (!PyUnicode_Check(subobj)) {
13295 PyErr_Format(PyExc_TypeError,
13296 "endswith first arg must be str or "
13297 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013299 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013300 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013301 if (result == -1)
13302 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013303 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
Victor Stinner202fdca2012-05-07 12:47:02 +020013306Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013307_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013308{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013309 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13310 writer->data = PyUnicode_DATA(writer->buffer);
13311
13312 if (!writer->readonly) {
13313 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013314 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013315 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013316 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013317 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13318 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13319 writer->kind = PyUnicode_WCHAR_KIND;
13320 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13321
Victor Stinner8f674cc2013-04-17 23:02:17 +020013322 /* Copy-on-write mode: set buffer size to 0 so
13323 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13324 * next write. */
13325 writer->size = 0;
13326 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013327}
13328
Victor Stinnerd3f08822012-05-29 12:57:52 +020013329void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013330_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013331{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013332 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013333
13334 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013335 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013336
13337 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13338 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13339 writer->kind = PyUnicode_WCHAR_KIND;
13340 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013341}
13342
Victor Stinnerd3f08822012-05-29 12:57:52 +020013343int
13344_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13345 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013346{
13347 Py_ssize_t newlen;
13348 PyObject *newbuffer;
13349
Victor Stinnerca9381e2015-09-22 00:58:32 +020013350 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013351 assert((maxchar > writer->maxchar && length >= 0)
13352 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013353
Victor Stinner202fdca2012-05-07 12:47:02 +020013354 if (length > PY_SSIZE_T_MAX - writer->pos) {
13355 PyErr_NoMemory();
13356 return -1;
13357 }
13358 newlen = writer->pos + length;
13359
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013360 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013361
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013363 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013364 if (writer->overallocate
13365 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13366 /* overallocate to limit the number of realloc() */
13367 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013368 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013369 if (newlen < writer->min_length)
13370 newlen = writer->min_length;
13371
Victor Stinnerd3f08822012-05-29 12:57:52 +020013372 writer->buffer = PyUnicode_New(newlen, maxchar);
13373 if (writer->buffer == NULL)
13374 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013375 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013376 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013377 if (writer->overallocate
13378 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13379 /* overallocate to limit the number of realloc() */
13380 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013381 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013382 if (newlen < writer->min_length)
13383 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013384
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013385 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013386 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013387 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013388 newbuffer = PyUnicode_New(newlen, maxchar);
13389 if (newbuffer == NULL)
13390 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013391 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13392 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013393 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013394 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013395 }
13396 else {
13397 newbuffer = resize_compact(writer->buffer, newlen);
13398 if (newbuffer == NULL)
13399 return -1;
13400 }
13401 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013402 }
13403 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013404 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013405 newbuffer = PyUnicode_New(writer->size, maxchar);
13406 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013407 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013408 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13409 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013410 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013411 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013412 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013413 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013414
13415#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013416}
13417
Victor Stinnerca9381e2015-09-22 00:58:32 +020013418int
13419_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13420 enum PyUnicode_Kind kind)
13421{
13422 Py_UCS4 maxchar;
13423
13424 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13425 assert(writer->kind < kind);
13426
13427 switch (kind)
13428 {
13429 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13430 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13431 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13432 default:
13433 assert(0 && "invalid kind");
13434 return -1;
13435 }
13436
13437 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13438}
13439
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013440Py_LOCAL_INLINE(int)
13441_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013442{
13443 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13444 return -1;
13445 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13446 writer->pos++;
13447 return 0;
13448}
13449
13450int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013451_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13452{
13453 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13454}
13455
13456int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013457_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13458{
13459 Py_UCS4 maxchar;
13460 Py_ssize_t len;
13461
13462 if (PyUnicode_READY(str) == -1)
13463 return -1;
13464 len = PyUnicode_GET_LENGTH(str);
13465 if (len == 0)
13466 return 0;
13467 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13468 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013469 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013470 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013471 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013472 Py_INCREF(str);
13473 writer->buffer = str;
13474 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013475 writer->pos += len;
13476 return 0;
13477 }
13478 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13479 return -1;
13480 }
13481 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13482 str, 0, len);
13483 writer->pos += len;
13484 return 0;
13485}
13486
Victor Stinnere215d962012-10-06 23:03:36 +020013487int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013488_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13489 Py_ssize_t start, Py_ssize_t end)
13490{
13491 Py_UCS4 maxchar;
13492 Py_ssize_t len;
13493
13494 if (PyUnicode_READY(str) == -1)
13495 return -1;
13496
13497 assert(0 <= start);
13498 assert(end <= PyUnicode_GET_LENGTH(str));
13499 assert(start <= end);
13500
13501 if (end == 0)
13502 return 0;
13503
13504 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13505 return _PyUnicodeWriter_WriteStr(writer, str);
13506
13507 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13508 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13509 else
13510 maxchar = writer->maxchar;
13511 len = end - start;
13512
13513 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13514 return -1;
13515
13516 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13517 str, start, len);
13518 writer->pos += len;
13519 return 0;
13520}
13521
13522int
Victor Stinner4a587072013-11-19 12:54:53 +010013523_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13524 const char *ascii, Py_ssize_t len)
13525{
13526 if (len == -1)
13527 len = strlen(ascii);
13528
13529 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13530
13531 if (writer->buffer == NULL && !writer->overallocate) {
13532 PyObject *str;
13533
13534 str = _PyUnicode_FromASCII(ascii, len);
13535 if (str == NULL)
13536 return -1;
13537
13538 writer->readonly = 1;
13539 writer->buffer = str;
13540 _PyUnicodeWriter_Update(writer);
13541 writer->pos += len;
13542 return 0;
13543 }
13544
13545 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13546 return -1;
13547
13548 switch (writer->kind)
13549 {
13550 case PyUnicode_1BYTE_KIND:
13551 {
13552 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13553 Py_UCS1 *data = writer->data;
13554
13555 Py_MEMCPY(data + writer->pos, str, len);
13556 break;
13557 }
13558 case PyUnicode_2BYTE_KIND:
13559 {
13560 _PyUnicode_CONVERT_BYTES(
13561 Py_UCS1, Py_UCS2,
13562 ascii, ascii + len,
13563 (Py_UCS2 *)writer->data + writer->pos);
13564 break;
13565 }
13566 case PyUnicode_4BYTE_KIND:
13567 {
13568 _PyUnicode_CONVERT_BYTES(
13569 Py_UCS1, Py_UCS4,
13570 ascii, ascii + len,
13571 (Py_UCS4 *)writer->data + writer->pos);
13572 break;
13573 }
13574 default:
13575 assert(0);
13576 }
13577
13578 writer->pos += len;
13579 return 0;
13580}
13581
13582int
13583_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13584 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013585{
13586 Py_UCS4 maxchar;
13587
13588 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13589 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13590 return -1;
13591 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13592 writer->pos += len;
13593 return 0;
13594}
13595
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013597_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013598{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013599 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013600 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013601 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013602 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013603 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013604 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013605 str = writer->buffer;
13606 writer->buffer = NULL;
13607 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13608 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013609 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013610 if (writer->pos == 0) {
13611 Py_CLEAR(writer->buffer);
13612
13613 /* Get the empty Unicode string singleton ('') */
13614 _Py_INCREF_UNICODE_EMPTY();
13615 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013616 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013617 else {
13618 str = writer->buffer;
13619 writer->buffer = NULL;
13620
13621 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13622 PyObject *str2;
13623 str2 = resize_compact(str, writer->pos);
13624 if (str2 == NULL)
13625 return NULL;
13626 str = str2;
13627 }
13628 }
13629
Victor Stinner15a0bd32013-07-08 22:29:55 +020013630 assert(_PyUnicode_CheckConsistency(str, 1));
13631 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013632}
13633
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013635_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013636{
13637 Py_CLEAR(writer->buffer);
13638}
13639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013640#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013641
13642PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013644\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013645Return a formatted version of S, using substitutions from args and kwargs.\n\
13646The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013647
Eric Smith27bbca62010-11-04 17:06:58 +000013648PyDoc_STRVAR(format_map__doc__,
13649 "S.format_map(mapping) -> str\n\
13650\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013651Return a formatted version of S, using substitutions from mapping.\n\
13652The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013653
Eric Smith4a7d76d2008-05-30 18:10:19 +000013654static PyObject *
13655unicode__format__(PyObject* self, PyObject* args)
13656{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013657 PyObject *format_spec;
13658 _PyUnicodeWriter writer;
13659 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013660
13661 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13662 return NULL;
13663
Victor Stinnerd3f08822012-05-29 12:57:52 +020013664 if (PyUnicode_READY(self) == -1)
13665 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013666 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013667 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13668 self, format_spec, 0,
13669 PyUnicode_GET_LENGTH(format_spec));
13670 if (ret == -1) {
13671 _PyUnicodeWriter_Dealloc(&writer);
13672 return NULL;
13673 }
13674 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013675}
13676
Eric Smith8c663262007-08-25 02:26:07 +000013677PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013679\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013680Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013681
13682static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013683unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 Py_ssize_t size;
13686
13687 /* If it's a compact object, account for base structure +
13688 character data. */
13689 if (PyUnicode_IS_COMPACT_ASCII(v))
13690 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13691 else if (PyUnicode_IS_COMPACT(v))
13692 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013693 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013694 else {
13695 /* If it is a two-block object, account for base object, and
13696 for character block if present. */
13697 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013698 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013700 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013701 }
13702 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013703 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013704 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013705 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013706 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013707 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013708
13709 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013710}
13711
13712PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013714
13715static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013716unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013717{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013718 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013719 if (!copy)
13720 return NULL;
13721 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013722}
13723
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013725 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013726 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013727 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13728 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013729 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13730 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013731 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013732 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13733 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13734 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013735 {"expandtabs", (PyCFunction) unicode_expandtabs,
13736 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013737 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013738 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013739 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13740 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13741 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013742 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013743 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13744 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13745 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013746 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013747 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013748 {"splitlines", (PyCFunction) unicode_splitlines,
13749 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013750 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013751 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13752 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13753 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13754 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13755 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13756 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13757 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13758 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13759 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13760 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13761 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13762 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13763 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13764 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013765 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013766 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013767 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013768 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013769 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013770 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013771 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013772 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013773#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013774 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013775 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013776#endif
13777
Benjamin Peterson14339b62009-01-31 16:36:08 +000013778 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779 {NULL, NULL}
13780};
13781
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013782static PyObject *
13783unicode_mod(PyObject *v, PyObject *w)
13784{
Brian Curtindfc80e32011-08-10 20:28:54 -050013785 if (!PyUnicode_Check(v))
13786 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013787 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013788}
13789
13790static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013791 0, /*nb_add*/
13792 0, /*nb_subtract*/
13793 0, /*nb_multiply*/
13794 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013795};
13796
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013798 (lenfunc) unicode_length, /* sq_length */
13799 PyUnicode_Concat, /* sq_concat */
13800 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13801 (ssizeargfunc) unicode_getitem, /* sq_item */
13802 0, /* sq_slice */
13803 0, /* sq_ass_item */
13804 0, /* sq_ass_slice */
13805 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806};
13807
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013808static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013809unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811 if (PyUnicode_READY(self) == -1)
13812 return NULL;
13813
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013814 if (PyIndex_Check(item)) {
13815 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013816 if (i == -1 && PyErr_Occurred())
13817 return NULL;
13818 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013819 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013820 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013821 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013822 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013823 PyObject *result;
13824 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013825 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013826 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013830 return NULL;
13831 }
13832
13833 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013834 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013836 slicelength == PyUnicode_GET_LENGTH(self)) {
13837 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013838 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013839 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013840 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013841 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013842 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013843 src_kind = PyUnicode_KIND(self);
13844 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013845 if (!PyUnicode_IS_ASCII(self)) {
13846 kind_limit = kind_maxchar_limit(src_kind);
13847 max_char = 0;
13848 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13849 ch = PyUnicode_READ(src_kind, src_data, cur);
13850 if (ch > max_char) {
13851 max_char = ch;
13852 if (max_char >= kind_limit)
13853 break;
13854 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013855 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013856 }
Victor Stinner55c99112011-10-13 01:17:06 +020013857 else
13858 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013859 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013860 if (result == NULL)
13861 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013862 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013863 dest_data = PyUnicode_DATA(result);
13864
13865 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013866 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13867 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013868 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013869 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013870 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013871 } else {
13872 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13873 return NULL;
13874 }
13875}
13876
13877static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013878 (lenfunc)unicode_length, /* mp_length */
13879 (binaryfunc)unicode_subscript, /* mp_subscript */
13880 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013881};
13882
Guido van Rossumd57fd912000-03-10 22:53:23 +000013883
Guido van Rossumd57fd912000-03-10 22:53:23 +000013884/* Helpers for PyUnicode_Format() */
13885
Victor Stinnera47082312012-10-04 02:19:54 +020013886struct unicode_formatter_t {
13887 PyObject *args;
13888 int args_owned;
13889 Py_ssize_t arglen, argidx;
13890 PyObject *dict;
13891
13892 enum PyUnicode_Kind fmtkind;
13893 Py_ssize_t fmtcnt, fmtpos;
13894 void *fmtdata;
13895 PyObject *fmtstr;
13896
13897 _PyUnicodeWriter writer;
13898};
13899
13900struct unicode_format_arg_t {
13901 Py_UCS4 ch;
13902 int flags;
13903 Py_ssize_t width;
13904 int prec;
13905 int sign;
13906};
13907
Guido van Rossumd57fd912000-03-10 22:53:23 +000013908static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013909unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910{
Victor Stinnera47082312012-10-04 02:19:54 +020013911 Py_ssize_t argidx = ctx->argidx;
13912
13913 if (argidx < ctx->arglen) {
13914 ctx->argidx++;
13915 if (ctx->arglen < 0)
13916 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013917 else
Victor Stinnera47082312012-10-04 02:19:54 +020013918 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919 }
13920 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 return NULL;
13923}
13924
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013925/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926
Victor Stinnera47082312012-10-04 02:19:54 +020013927/* Format a float into the writer if the writer is not NULL, or into *p_output
13928 otherwise.
13929
13930 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013931static int
Victor Stinnera47082312012-10-04 02:19:54 +020013932formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13933 PyObject **p_output,
13934 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013936 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013939 int prec;
13940 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013941
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942 x = PyFloat_AsDouble(v);
13943 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013945
Victor Stinnera47082312012-10-04 02:19:54 +020013946 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013949
Victor Stinnera47082312012-10-04 02:19:54 +020013950 if (arg->flags & F_ALT)
13951 dtoa_flags = Py_DTSF_ALT;
13952 else
13953 dtoa_flags = 0;
13954 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013955 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013956 return -1;
13957 len = strlen(p);
13958 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013959 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013960 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013961 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013962 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013963 }
13964 else
13965 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013966 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013967 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013968}
13969
Victor Stinnerd0880d52012-04-27 23:40:13 +020013970/* formatlong() emulates the format codes d, u, o, x and X, and
13971 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13972 * Python's regular ints.
13973 * Return value: a new PyUnicodeObject*, or NULL if error.
13974 * The output string is of the form
13975 * "-"? ("0x" | "0X")? digit+
13976 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13977 * set in flags. The case of hex digits will be correct,
13978 * There will be at least prec digits, zero-filled on the left if
13979 * necessary to get that many.
13980 * val object to be converted
13981 * flags bitmask of format flags; only F_ALT is looked at
13982 * prec minimum number of digits; 0-fill on left if needed
13983 * type a character in [duoxX]; u acts the same as d
13984 *
13985 * CAUTION: o, x and X conversions on regular ints can never
13986 * produce a '-' sign, but can for Python's unbounded ints.
13987 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013988PyObject *
13989_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013990{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013991 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013993 Py_ssize_t i;
13994 int sign; /* 1 if '-', else 0 */
13995 int len; /* number of characters */
13996 Py_ssize_t llen;
13997 int numdigits; /* len == numnondigits + numdigits */
13998 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013999
Victor Stinnerd0880d52012-04-27 23:40:13 +020014000 /* Avoid exceeding SSIZE_T_MAX */
14001 if (prec > INT_MAX-3) {
14002 PyErr_SetString(PyExc_OverflowError,
14003 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014005 }
14006
14007 assert(PyLong_Check(val));
14008
14009 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014010 default:
14011 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014012 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014013 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014014 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014015 /* int and int subclasses should print numerically when a numeric */
14016 /* format code is used (see issue18780) */
14017 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014018 break;
14019 case 'o':
14020 numnondigits = 2;
14021 result = PyNumber_ToBase(val, 8);
14022 break;
14023 case 'x':
14024 case 'X':
14025 numnondigits = 2;
14026 result = PyNumber_ToBase(val, 16);
14027 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014028 }
14029 if (!result)
14030 return NULL;
14031
14032 assert(unicode_modifiable(result));
14033 assert(PyUnicode_IS_READY(result));
14034 assert(PyUnicode_IS_ASCII(result));
14035
14036 /* To modify the string in-place, there can only be one reference. */
14037 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014038 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014039 PyErr_BadInternalCall();
14040 return NULL;
14041 }
14042 buf = PyUnicode_DATA(result);
14043 llen = PyUnicode_GET_LENGTH(result);
14044 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014045 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014046 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014047 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014048 return NULL;
14049 }
14050 len = (int)llen;
14051 sign = buf[0] == '-';
14052 numnondigits += sign;
14053 numdigits = len - numnondigits;
14054 assert(numdigits > 0);
14055
14056 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014057 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014058 (type == 'o' || type == 'x' || type == 'X'))) {
14059 assert(buf[sign] == '0');
14060 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14061 buf[sign+1] == 'o');
14062 numnondigits -= 2;
14063 buf += 2;
14064 len -= 2;
14065 if (sign)
14066 buf[0] = '-';
14067 assert(len == numnondigits + numdigits);
14068 assert(numdigits > 0);
14069 }
14070
14071 /* Fill with leading zeroes to meet minimum width. */
14072 if (prec > numdigits) {
14073 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14074 numnondigits + prec);
14075 char *b1;
14076 if (!r1) {
14077 Py_DECREF(result);
14078 return NULL;
14079 }
14080 b1 = PyBytes_AS_STRING(r1);
14081 for (i = 0; i < numnondigits; ++i)
14082 *b1++ = *buf++;
14083 for (i = 0; i < prec - numdigits; i++)
14084 *b1++ = '0';
14085 for (i = 0; i < numdigits; i++)
14086 *b1++ = *buf++;
14087 *b1 = '\0';
14088 Py_DECREF(result);
14089 result = r1;
14090 buf = PyBytes_AS_STRING(result);
14091 len = numnondigits + prec;
14092 }
14093
14094 /* Fix up case for hex conversions. */
14095 if (type == 'X') {
14096 /* Need to convert all lower case letters to upper case.
14097 and need to convert 0x to 0X (and -0x to -0X). */
14098 for (i = 0; i < len; i++)
14099 if (buf[i] >= 'a' && buf[i] <= 'x')
14100 buf[i] -= 'a'-'A';
14101 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 if (!PyUnicode_Check(result)
14103 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014104 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014105 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014106 Py_DECREF(result);
14107 result = unicode;
14108 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014109 else if (len != PyUnicode_GET_LENGTH(result)) {
14110 if (PyUnicode_Resize(&result, len) < 0)
14111 Py_CLEAR(result);
14112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014114}
14115
Ethan Furmandf3ed242014-01-05 06:50:30 -080014116/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014117 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014118 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014119 * -1 and raise an exception on error */
14120static int
Victor Stinnera47082312012-10-04 02:19:54 +020014121mainformatlong(PyObject *v,
14122 struct unicode_format_arg_t *arg,
14123 PyObject **p_output,
14124 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014125{
14126 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014127 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014128
14129 if (!PyNumber_Check(v))
14130 goto wrongtype;
14131
Ethan Furman9ab74802014-03-21 06:38:46 -070014132 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014133 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014134 if (type == 'o' || type == 'x' || type == 'X') {
14135 iobj = PyNumber_Index(v);
14136 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014137 if (PyErr_ExceptionMatches(PyExc_TypeError))
14138 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014139 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014140 }
14141 }
14142 else {
14143 iobj = PyNumber_Long(v);
14144 if (iobj == NULL ) {
14145 if (PyErr_ExceptionMatches(PyExc_TypeError))
14146 goto wrongtype;
14147 return -1;
14148 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014149 }
14150 assert(PyLong_Check(iobj));
14151 }
14152 else {
14153 iobj = v;
14154 Py_INCREF(iobj);
14155 }
14156
14157 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014158 && arg->width == -1 && arg->prec == -1
14159 && !(arg->flags & (F_SIGN | F_BLANK))
14160 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014161 {
14162 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014163 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014164 int base;
14165
Victor Stinnera47082312012-10-04 02:19:54 +020014166 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014167 {
14168 default:
14169 assert(0 && "'type' not in [diuoxX]");
14170 case 'd':
14171 case 'i':
14172 case 'u':
14173 base = 10;
14174 break;
14175 case 'o':
14176 base = 8;
14177 break;
14178 case 'x':
14179 case 'X':
14180 base = 16;
14181 break;
14182 }
14183
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014184 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14185 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014186 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014187 }
14188 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014189 return 1;
14190 }
14191
Ethan Furmanb95b5612015-01-23 20:05:18 -080014192 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014193 Py_DECREF(iobj);
14194 if (res == NULL)
14195 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014196 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014197 return 0;
14198
14199wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014200 switch(type)
14201 {
14202 case 'o':
14203 case 'x':
14204 case 'X':
14205 PyErr_Format(PyExc_TypeError,
14206 "%%%c format: an integer is required, "
14207 "not %.200s",
14208 type, Py_TYPE(v)->tp_name);
14209 break;
14210 default:
14211 PyErr_Format(PyExc_TypeError,
14212 "%%%c format: a number is required, "
14213 "not %.200s",
14214 type, Py_TYPE(v)->tp_name);
14215 break;
14216 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014217 return -1;
14218}
14219
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014220static Py_UCS4
14221formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014222{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014223 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014224 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014225 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014226 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014227 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014228 goto onError;
14229 }
14230 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014231 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014232 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014233 /* make sure number is a type of integer */
14234 if (!PyLong_Check(v)) {
14235 iobj = PyNumber_Index(v);
14236 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014237 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014238 }
14239 v = iobj;
14240 Py_DECREF(iobj);
14241 }
14242 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014243 x = PyLong_AsLong(v);
14244 if (x == -1 && PyErr_Occurred())
14245 goto onError;
14246
Victor Stinner8faf8212011-12-08 22:14:11 +010014247 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014248 PyErr_SetString(PyExc_OverflowError,
14249 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014250 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014251 }
14252
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014253 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014254 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014255
Benjamin Peterson29060642009-01-31 22:14:21 +000014256 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014257 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014258 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014259 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014260}
14261
Victor Stinnera47082312012-10-04 02:19:54 +020014262/* Parse options of an argument: flags, width, precision.
14263 Handle also "%(name)" syntax.
14264
14265 Return 0 if the argument has been formatted into arg->str.
14266 Return 1 if the argument has been written into ctx->writer,
14267 Raise an exception and return -1 on error. */
14268static int
14269unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14270 struct unicode_format_arg_t *arg)
14271{
14272#define FORMAT_READ(ctx) \
14273 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14274
14275 PyObject *v;
14276
Victor Stinnera47082312012-10-04 02:19:54 +020014277 if (arg->ch == '(') {
14278 /* Get argument value from a dictionary. Example: "%(name)s". */
14279 Py_ssize_t keystart;
14280 Py_ssize_t keylen;
14281 PyObject *key;
14282 int pcount = 1;
14283
14284 if (ctx->dict == NULL) {
14285 PyErr_SetString(PyExc_TypeError,
14286 "format requires a mapping");
14287 return -1;
14288 }
14289 ++ctx->fmtpos;
14290 --ctx->fmtcnt;
14291 keystart = ctx->fmtpos;
14292 /* Skip over balanced parentheses */
14293 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14294 arg->ch = FORMAT_READ(ctx);
14295 if (arg->ch == ')')
14296 --pcount;
14297 else if (arg->ch == '(')
14298 ++pcount;
14299 ctx->fmtpos++;
14300 }
14301 keylen = ctx->fmtpos - keystart - 1;
14302 if (ctx->fmtcnt < 0 || pcount > 0) {
14303 PyErr_SetString(PyExc_ValueError,
14304 "incomplete format key");
14305 return -1;
14306 }
14307 key = PyUnicode_Substring(ctx->fmtstr,
14308 keystart, keystart + keylen);
14309 if (key == NULL)
14310 return -1;
14311 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014312 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014313 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014314 }
14315 ctx->args = PyObject_GetItem(ctx->dict, key);
14316 Py_DECREF(key);
14317 if (ctx->args == NULL)
14318 return -1;
14319 ctx->args_owned = 1;
14320 ctx->arglen = -1;
14321 ctx->argidx = -2;
14322 }
14323
14324 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014325 while (--ctx->fmtcnt >= 0) {
14326 arg->ch = FORMAT_READ(ctx);
14327 ctx->fmtpos++;
14328 switch (arg->ch) {
14329 case '-': arg->flags |= F_LJUST; continue;
14330 case '+': arg->flags |= F_SIGN; continue;
14331 case ' ': arg->flags |= F_BLANK; continue;
14332 case '#': arg->flags |= F_ALT; continue;
14333 case '0': arg->flags |= F_ZERO; continue;
14334 }
14335 break;
14336 }
14337
14338 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014339 if (arg->ch == '*') {
14340 v = unicode_format_getnextarg(ctx);
14341 if (v == NULL)
14342 return -1;
14343 if (!PyLong_Check(v)) {
14344 PyErr_SetString(PyExc_TypeError,
14345 "* wants int");
14346 return -1;
14347 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014348 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014349 if (arg->width == -1 && PyErr_Occurred())
14350 return -1;
14351 if (arg->width < 0) {
14352 arg->flags |= F_LJUST;
14353 arg->width = -arg->width;
14354 }
14355 if (--ctx->fmtcnt >= 0) {
14356 arg->ch = FORMAT_READ(ctx);
14357 ctx->fmtpos++;
14358 }
14359 }
14360 else if (arg->ch >= '0' && arg->ch <= '9') {
14361 arg->width = arg->ch - '0';
14362 while (--ctx->fmtcnt >= 0) {
14363 arg->ch = FORMAT_READ(ctx);
14364 ctx->fmtpos++;
14365 if (arg->ch < '0' || arg->ch > '9')
14366 break;
14367 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14368 mixing signed and unsigned comparison. Since arg->ch is between
14369 '0' and '9', casting to int is safe. */
14370 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14371 PyErr_SetString(PyExc_ValueError,
14372 "width too big");
14373 return -1;
14374 }
14375 arg->width = arg->width*10 + (arg->ch - '0');
14376 }
14377 }
14378
14379 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014380 if (arg->ch == '.') {
14381 arg->prec = 0;
14382 if (--ctx->fmtcnt >= 0) {
14383 arg->ch = FORMAT_READ(ctx);
14384 ctx->fmtpos++;
14385 }
14386 if (arg->ch == '*') {
14387 v = unicode_format_getnextarg(ctx);
14388 if (v == NULL)
14389 return -1;
14390 if (!PyLong_Check(v)) {
14391 PyErr_SetString(PyExc_TypeError,
14392 "* wants int");
14393 return -1;
14394 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014395 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014396 if (arg->prec == -1 && PyErr_Occurred())
14397 return -1;
14398 if (arg->prec < 0)
14399 arg->prec = 0;
14400 if (--ctx->fmtcnt >= 0) {
14401 arg->ch = FORMAT_READ(ctx);
14402 ctx->fmtpos++;
14403 }
14404 }
14405 else if (arg->ch >= '0' && arg->ch <= '9') {
14406 arg->prec = arg->ch - '0';
14407 while (--ctx->fmtcnt >= 0) {
14408 arg->ch = FORMAT_READ(ctx);
14409 ctx->fmtpos++;
14410 if (arg->ch < '0' || arg->ch > '9')
14411 break;
14412 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14413 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014414 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014415 return -1;
14416 }
14417 arg->prec = arg->prec*10 + (arg->ch - '0');
14418 }
14419 }
14420 }
14421
14422 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14423 if (ctx->fmtcnt >= 0) {
14424 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14425 if (--ctx->fmtcnt >= 0) {
14426 arg->ch = FORMAT_READ(ctx);
14427 ctx->fmtpos++;
14428 }
14429 }
14430 }
14431 if (ctx->fmtcnt < 0) {
14432 PyErr_SetString(PyExc_ValueError,
14433 "incomplete format");
14434 return -1;
14435 }
14436 return 0;
14437
14438#undef FORMAT_READ
14439}
14440
14441/* Format one argument. Supported conversion specifiers:
14442
14443 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014444 - "i", "d", "u": int or float
14445 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014446 - "e", "E", "f", "F", "g", "G": float
14447 - "c": int or str (1 character)
14448
Victor Stinner8dbd4212012-12-04 09:30:24 +010014449 When possible, the output is written directly into the Unicode writer
14450 (ctx->writer). A string is created when padding is required.
14451
Victor Stinnera47082312012-10-04 02:19:54 +020014452 Return 0 if the argument has been formatted into *p_str,
14453 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014454 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014455static int
14456unicode_format_arg_format(struct unicode_formatter_t *ctx,
14457 struct unicode_format_arg_t *arg,
14458 PyObject **p_str)
14459{
14460 PyObject *v;
14461 _PyUnicodeWriter *writer = &ctx->writer;
14462
14463 if (ctx->fmtcnt == 0)
14464 ctx->writer.overallocate = 0;
14465
14466 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014467 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014468 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014469 return 1;
14470 }
14471
14472 v = unicode_format_getnextarg(ctx);
14473 if (v == NULL)
14474 return -1;
14475
Victor Stinnera47082312012-10-04 02:19:54 +020014476
14477 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014478 case 's':
14479 case 'r':
14480 case 'a':
14481 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14482 /* Fast path */
14483 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14484 return -1;
14485 return 1;
14486 }
14487
14488 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14489 *p_str = v;
14490 Py_INCREF(*p_str);
14491 }
14492 else {
14493 if (arg->ch == 's')
14494 *p_str = PyObject_Str(v);
14495 else if (arg->ch == 'r')
14496 *p_str = PyObject_Repr(v);
14497 else
14498 *p_str = PyObject_ASCII(v);
14499 }
14500 break;
14501
14502 case 'i':
14503 case 'd':
14504 case 'u':
14505 case 'o':
14506 case 'x':
14507 case 'X':
14508 {
14509 int ret = mainformatlong(v, arg, p_str, writer);
14510 if (ret != 0)
14511 return ret;
14512 arg->sign = 1;
14513 break;
14514 }
14515
14516 case 'e':
14517 case 'E':
14518 case 'f':
14519 case 'F':
14520 case 'g':
14521 case 'G':
14522 if (arg->width == -1 && arg->prec == -1
14523 && !(arg->flags & (F_SIGN | F_BLANK)))
14524 {
14525 /* Fast path */
14526 if (formatfloat(v, arg, NULL, writer) == -1)
14527 return -1;
14528 return 1;
14529 }
14530
14531 arg->sign = 1;
14532 if (formatfloat(v, arg, p_str, NULL) == -1)
14533 return -1;
14534 break;
14535
14536 case 'c':
14537 {
14538 Py_UCS4 ch = formatchar(v);
14539 if (ch == (Py_UCS4) -1)
14540 return -1;
14541 if (arg->width == -1 && arg->prec == -1) {
14542 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014543 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014544 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014545 return 1;
14546 }
14547 *p_str = PyUnicode_FromOrdinal(ch);
14548 break;
14549 }
14550
14551 default:
14552 PyErr_Format(PyExc_ValueError,
14553 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014554 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014555 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14556 (int)arg->ch,
14557 ctx->fmtpos - 1);
14558 return -1;
14559 }
14560 if (*p_str == NULL)
14561 return -1;
14562 assert (PyUnicode_Check(*p_str));
14563 return 0;
14564}
14565
14566static int
14567unicode_format_arg_output(struct unicode_formatter_t *ctx,
14568 struct unicode_format_arg_t *arg,
14569 PyObject *str)
14570{
14571 Py_ssize_t len;
14572 enum PyUnicode_Kind kind;
14573 void *pbuf;
14574 Py_ssize_t pindex;
14575 Py_UCS4 signchar;
14576 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014577 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014578 Py_ssize_t sublen;
14579 _PyUnicodeWriter *writer = &ctx->writer;
14580 Py_UCS4 fill;
14581
14582 fill = ' ';
14583 if (arg->sign && arg->flags & F_ZERO)
14584 fill = '0';
14585
14586 if (PyUnicode_READY(str) == -1)
14587 return -1;
14588
14589 len = PyUnicode_GET_LENGTH(str);
14590 if ((arg->width == -1 || arg->width <= len)
14591 && (arg->prec == -1 || arg->prec >= len)
14592 && !(arg->flags & (F_SIGN | F_BLANK)))
14593 {
14594 /* Fast path */
14595 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14596 return -1;
14597 return 0;
14598 }
14599
14600 /* Truncate the string for "s", "r" and "a" formats
14601 if the precision is set */
14602 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14603 if (arg->prec >= 0 && len > arg->prec)
14604 len = arg->prec;
14605 }
14606
14607 /* Adjust sign and width */
14608 kind = PyUnicode_KIND(str);
14609 pbuf = PyUnicode_DATA(str);
14610 pindex = 0;
14611 signchar = '\0';
14612 if (arg->sign) {
14613 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14614 if (ch == '-' || ch == '+') {
14615 signchar = ch;
14616 len--;
14617 pindex++;
14618 }
14619 else if (arg->flags & F_SIGN)
14620 signchar = '+';
14621 else if (arg->flags & F_BLANK)
14622 signchar = ' ';
14623 else
14624 arg->sign = 0;
14625 }
14626 if (arg->width < len)
14627 arg->width = len;
14628
14629 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014630 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014631 if (!(arg->flags & F_LJUST)) {
14632 if (arg->sign) {
14633 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014634 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014635 }
14636 else {
14637 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014638 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014639 }
14640 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014641 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14642 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014643 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014644 }
14645
Victor Stinnera47082312012-10-04 02:19:54 +020014646 buflen = arg->width;
14647 if (arg->sign && len == arg->width)
14648 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014649 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014650 return -1;
14651
14652 /* Write the sign if needed */
14653 if (arg->sign) {
14654 if (fill != ' ') {
14655 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14656 writer->pos += 1;
14657 }
14658 if (arg->width > len)
14659 arg->width--;
14660 }
14661
14662 /* Write the numeric prefix for "x", "X" and "o" formats
14663 if the alternate form is used.
14664 For example, write "0x" for the "%#x" format. */
14665 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14666 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14667 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14668 if (fill != ' ') {
14669 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14670 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14671 writer->pos += 2;
14672 pindex += 2;
14673 }
14674 arg->width -= 2;
14675 if (arg->width < 0)
14676 arg->width = 0;
14677 len -= 2;
14678 }
14679
14680 /* Pad left with the fill character if needed */
14681 if (arg->width > len && !(arg->flags & F_LJUST)) {
14682 sublen = arg->width - len;
14683 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14684 writer->pos += sublen;
14685 arg->width = len;
14686 }
14687
14688 /* If padding with spaces: write sign if needed and/or numeric prefix if
14689 the alternate form is used */
14690 if (fill == ' ') {
14691 if (arg->sign) {
14692 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14693 writer->pos += 1;
14694 }
14695 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14696 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14697 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14698 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14699 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14700 writer->pos += 2;
14701 pindex += 2;
14702 }
14703 }
14704
14705 /* Write characters */
14706 if (len) {
14707 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14708 str, pindex, len);
14709 writer->pos += len;
14710 }
14711
14712 /* Pad right with the fill character if needed */
14713 if (arg->width > len) {
14714 sublen = arg->width - len;
14715 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14716 writer->pos += sublen;
14717 }
14718 return 0;
14719}
14720
14721/* Helper of PyUnicode_Format(): format one arg.
14722 Return 0 on success, raise an exception and return -1 on error. */
14723static int
14724unicode_format_arg(struct unicode_formatter_t *ctx)
14725{
14726 struct unicode_format_arg_t arg;
14727 PyObject *str;
14728 int ret;
14729
Victor Stinner8dbd4212012-12-04 09:30:24 +010014730 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14731 arg.flags = 0;
14732 arg.width = -1;
14733 arg.prec = -1;
14734 arg.sign = 0;
14735 str = NULL;
14736
Victor Stinnera47082312012-10-04 02:19:54 +020014737 ret = unicode_format_arg_parse(ctx, &arg);
14738 if (ret == -1)
14739 return -1;
14740
14741 ret = unicode_format_arg_format(ctx, &arg, &str);
14742 if (ret == -1)
14743 return -1;
14744
14745 if (ret != 1) {
14746 ret = unicode_format_arg_output(ctx, &arg, str);
14747 Py_DECREF(str);
14748 if (ret == -1)
14749 return -1;
14750 }
14751
14752 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14753 PyErr_SetString(PyExc_TypeError,
14754 "not all arguments converted during string formatting");
14755 return -1;
14756 }
14757 return 0;
14758}
14759
Alexander Belopolsky40018472011-02-26 01:02:56 +000014760PyObject *
14761PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014762{
Victor Stinnera47082312012-10-04 02:19:54 +020014763 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014764
Guido van Rossumd57fd912000-03-10 22:53:23 +000014765 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014766 PyErr_BadInternalCall();
14767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014768 }
Victor Stinnera47082312012-10-04 02:19:54 +020014769
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014770 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014771 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014772
14773 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014774 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14775 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14776 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14777 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014778
Victor Stinner8f674cc2013-04-17 23:02:17 +020014779 _PyUnicodeWriter_Init(&ctx.writer);
14780 ctx.writer.min_length = ctx.fmtcnt + 100;
14781 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014782
Guido van Rossumd57fd912000-03-10 22:53:23 +000014783 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014784 ctx.arglen = PyTuple_Size(args);
14785 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014786 }
14787 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014788 ctx.arglen = -1;
14789 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014790 }
Victor Stinnera47082312012-10-04 02:19:54 +020014791 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014792 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014793 ctx.dict = args;
14794 else
14795 ctx.dict = NULL;
14796 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014797
Victor Stinnera47082312012-10-04 02:19:54 +020014798 while (--ctx.fmtcnt >= 0) {
14799 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014800 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014801
14802 nonfmtpos = ctx.fmtpos++;
14803 while (ctx.fmtcnt >= 0 &&
14804 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14805 ctx.fmtpos++;
14806 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 }
Victor Stinnera47082312012-10-04 02:19:54 +020014808 if (ctx.fmtcnt < 0) {
14809 ctx.fmtpos--;
14810 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014811 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014812
Victor Stinnercfc4c132013-04-03 01:48:39 +020014813 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14814 nonfmtpos, ctx.fmtpos) < 0)
14815 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014816 }
14817 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014818 ctx.fmtpos++;
14819 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014820 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014821 }
14822 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014823
Victor Stinnera47082312012-10-04 02:19:54 +020014824 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014825 PyErr_SetString(PyExc_TypeError,
14826 "not all arguments converted during string formatting");
14827 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014828 }
14829
Victor Stinnera47082312012-10-04 02:19:54 +020014830 if (ctx.args_owned) {
14831 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014832 }
Victor Stinnera47082312012-10-04 02:19:54 +020014833 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014834
Benjamin Peterson29060642009-01-31 22:14:21 +000014835 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014836 _PyUnicodeWriter_Dealloc(&ctx.writer);
14837 if (ctx.args_owned) {
14838 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014839 }
14840 return NULL;
14841}
14842
Jeremy Hylton938ace62002-07-17 16:30:39 +000014843static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014844unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14845
Tim Peters6d6c1a32001-08-02 04:15:00 +000014846static PyObject *
14847unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14848{
Benjamin Peterson29060642009-01-31 22:14:21 +000014849 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014850 static char *kwlist[] = {"object", "encoding", "errors", 0};
14851 char *encoding = NULL;
14852 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014853
Benjamin Peterson14339b62009-01-31 16:36:08 +000014854 if (type != &PyUnicode_Type)
14855 return unicode_subtype_new(type, args, kwds);
14856 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014857 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 return NULL;
14859 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014860 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014861 if (encoding == NULL && errors == NULL)
14862 return PyObject_Str(x);
14863 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014864 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014865}
14866
Guido van Rossume023fe02001-08-30 03:12:59 +000014867static PyObject *
14868unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14869{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014870 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014871 Py_ssize_t length, char_size;
14872 int share_wstr, share_utf8;
14873 unsigned int kind;
14874 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014875
Benjamin Peterson14339b62009-01-31 16:36:08 +000014876 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014877
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014878 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014879 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014880 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014881 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014882 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014883 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014884 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014885 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014886
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014887 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014888 if (self == NULL) {
14889 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014890 return NULL;
14891 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014892 kind = PyUnicode_KIND(unicode);
14893 length = PyUnicode_GET_LENGTH(unicode);
14894
14895 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014896#ifdef Py_DEBUG
14897 _PyUnicode_HASH(self) = -1;
14898#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014899 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014900#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014901 _PyUnicode_STATE(self).interned = 0;
14902 _PyUnicode_STATE(self).kind = kind;
14903 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014904 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014905 _PyUnicode_STATE(self).ready = 1;
14906 _PyUnicode_WSTR(self) = NULL;
14907 _PyUnicode_UTF8_LENGTH(self) = 0;
14908 _PyUnicode_UTF8(self) = NULL;
14909 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014910 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014911
14912 share_utf8 = 0;
14913 share_wstr = 0;
14914 if (kind == PyUnicode_1BYTE_KIND) {
14915 char_size = 1;
14916 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14917 share_utf8 = 1;
14918 }
14919 else if (kind == PyUnicode_2BYTE_KIND) {
14920 char_size = 2;
14921 if (sizeof(wchar_t) == 2)
14922 share_wstr = 1;
14923 }
14924 else {
14925 assert(kind == PyUnicode_4BYTE_KIND);
14926 char_size = 4;
14927 if (sizeof(wchar_t) == 4)
14928 share_wstr = 1;
14929 }
14930
14931 /* Ensure we won't overflow the length. */
14932 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14933 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014934 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014935 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014936 data = PyObject_MALLOC((length + 1) * char_size);
14937 if (data == NULL) {
14938 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014939 goto onError;
14940 }
14941
Victor Stinnerc3c74152011-10-02 20:39:55 +020014942 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014943 if (share_utf8) {
14944 _PyUnicode_UTF8_LENGTH(self) = length;
14945 _PyUnicode_UTF8(self) = data;
14946 }
14947 if (share_wstr) {
14948 _PyUnicode_WSTR_LENGTH(self) = length;
14949 _PyUnicode_WSTR(self) = (wchar_t *)data;
14950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014951
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014952 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014953 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014954 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014955#ifdef Py_DEBUG
14956 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14957#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014958 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014959 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014960
14961onError:
14962 Py_DECREF(unicode);
14963 Py_DECREF(self);
14964 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014965}
14966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014967PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014968"str(object='') -> str\n\
14969str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014970\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014971Create a new string object from the given object. If encoding or\n\
14972errors is specified, then the object must expose a data buffer\n\
14973that will be decoded using the given encoding and error handler.\n\
14974Otherwise, returns the result of object.__str__() (if defined)\n\
14975or repr(object).\n\
14976encoding defaults to sys.getdefaultencoding().\n\
14977errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014978
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014979static PyObject *unicode_iter(PyObject *seq);
14980
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014982 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014983 "str", /* tp_name */
14984 sizeof(PyUnicodeObject), /* tp_size */
14985 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014986 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 (destructor)unicode_dealloc, /* tp_dealloc */
14988 0, /* tp_print */
14989 0, /* tp_getattr */
14990 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014991 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 unicode_repr, /* tp_repr */
14993 &unicode_as_number, /* tp_as_number */
14994 &unicode_as_sequence, /* tp_as_sequence */
14995 &unicode_as_mapping, /* tp_as_mapping */
14996 (hashfunc) unicode_hash, /* tp_hash*/
14997 0, /* tp_call*/
14998 (reprfunc) unicode_str, /* tp_str */
14999 PyObject_GenericGetAttr, /* tp_getattro */
15000 0, /* tp_setattro */
15001 0, /* tp_as_buffer */
15002 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015003 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015004 unicode_doc, /* tp_doc */
15005 0, /* tp_traverse */
15006 0, /* tp_clear */
15007 PyUnicode_RichCompare, /* tp_richcompare */
15008 0, /* tp_weaklistoffset */
15009 unicode_iter, /* tp_iter */
15010 0, /* tp_iternext */
15011 unicode_methods, /* tp_methods */
15012 0, /* tp_members */
15013 0, /* tp_getset */
15014 &PyBaseObject_Type, /* tp_base */
15015 0, /* tp_dict */
15016 0, /* tp_descr_get */
15017 0, /* tp_descr_set */
15018 0, /* tp_dictoffset */
15019 0, /* tp_init */
15020 0, /* tp_alloc */
15021 unicode_new, /* tp_new */
15022 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023};
15024
15025/* Initialize the Unicode implementation */
15026
Victor Stinner3a50e702011-10-18 21:21:00 +020015027int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015028{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015029 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015030 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015031 0x000A, /* LINE FEED */
15032 0x000D, /* CARRIAGE RETURN */
15033 0x001C, /* FILE SEPARATOR */
15034 0x001D, /* GROUP SEPARATOR */
15035 0x001E, /* RECORD SEPARATOR */
15036 0x0085, /* NEXT LINE */
15037 0x2028, /* LINE SEPARATOR */
15038 0x2029, /* PARAGRAPH SEPARATOR */
15039 };
15040
Fred Drakee4315f52000-05-09 19:53:39 +000015041 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015042 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015043 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015044 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015045 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015046
Guido van Rossumcacfc072002-05-24 19:01:59 +000015047 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015048 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015049
15050 /* initialize the linebreak bloom filter */
15051 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015052 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015053 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015054
Christian Heimes26532f72013-07-20 14:57:16 +020015055 if (PyType_Ready(&EncodingMapType) < 0)
15056 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015057
Benjamin Petersonc4311282012-10-30 23:21:10 -040015058 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15059 Py_FatalError("Can't initialize field name iterator type");
15060
15061 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15062 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015063
Victor Stinner3a50e702011-10-18 21:21:00 +020015064 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015065}
15066
/* Finalize the Unicode implementation */

/* This implementation does nothing here and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15074
Guido van Rossumd57fd912000-03-10 22:53:23 +000015075void
Thomas Wouters78890102000-07-22 19:25:51 +000015076_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015077{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015078 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015079
Serhiy Storchaka05997252013-01-26 12:14:02 +020015080 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015081
Serhiy Storchaka05997252013-01-26 12:14:02 +020015082 for (i = 0; i < 256; i++)
15083 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015084 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015085 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015086}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015087
Walter Dörwald16807132007-05-25 13:52:07 +000015088void
15089PyUnicode_InternInPlace(PyObject **p)
15090{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015091 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015093#ifdef Py_DEBUG
15094 assert(s != NULL);
15095 assert(_PyUnicode_CHECK(s));
15096#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015097 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015098 return;
15099#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015100 /* If it's a subclass, we don't really know what putting
15101 it in the interned dict might do. */
15102 if (!PyUnicode_CheckExact(s))
15103 return;
15104 if (PyUnicode_CHECK_INTERNED(s))
15105 return;
15106 if (interned == NULL) {
15107 interned = PyDict_New();
15108 if (interned == NULL) {
15109 PyErr_Clear(); /* Don't leave an exception */
15110 return;
15111 }
15112 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015114 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015116 if (t == NULL) {
15117 PyErr_Clear();
15118 return;
15119 }
15120 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015121 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015122 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015123 return;
15124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 /* The two references in interned are not counted by refcnt.
15126 The deallocator will take care of this */
15127 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015129}
15130
15131void
15132PyUnicode_InternImmortal(PyObject **p)
15133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 PyUnicode_InternInPlace(p);
15135 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015136 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 Py_INCREF(*p);
15138 }
Walter Dörwald16807132007-05-25 13:52:07 +000015139}
15140
15141PyObject *
15142PyUnicode_InternFromString(const char *cp)
15143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 PyObject *s = PyUnicode_FromString(cp);
15145 if (s == NULL)
15146 return NULL;
15147 PyUnicode_InternInPlace(&s);
15148 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015149}
15150
Alexander Belopolsky40018472011-02-26 01:02:56 +000015151void
15152_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015153{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015154 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015155 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 Py_ssize_t i, n;
15157 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015158
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 if (interned == NULL || !PyDict_Check(interned))
15160 return;
15161 keys = PyDict_Keys(interned);
15162 if (keys == NULL || !PyList_Check(keys)) {
15163 PyErr_Clear();
15164 return;
15165 }
Walter Dörwald16807132007-05-25 13:52:07 +000015166
Benjamin Peterson14339b62009-01-31 16:36:08 +000015167 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15168 detector, interned unicode strings are not forcibly deallocated;
15169 rather, we give them their stolen references back, and then clear
15170 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015171
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 n = PyList_GET_SIZE(keys);
15173 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015174 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015176 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015177 if (PyUnicode_READY(s) == -1) {
15178 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015179 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015181 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 case SSTATE_NOT_INTERNED:
15183 /* XXX Shouldn't happen */
15184 break;
15185 case SSTATE_INTERNED_IMMORTAL:
15186 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015187 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 break;
15189 case SSTATE_INTERNED_MORTAL:
15190 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015191 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 break;
15193 default:
15194 Py_FatalError("Inconsistent interned string state.");
15195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015196 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 }
15198 fprintf(stderr, "total size of all interned strings: "
15199 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15200 "mortal/immortal\n", mortal_size, immortal_size);
15201 Py_DECREF(keys);
15202 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015203 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015204}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015205
15206
15207/********************* Unicode Iterator **************************/
15208
15209typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015210 PyObject_HEAD
15211 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015212 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015213} unicodeiterobject;
15214
15215static void
15216unicodeiter_dealloc(unicodeiterobject *it)
15217{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015218 _PyObject_GC_UNTRACK(it);
15219 Py_XDECREF(it->it_seq);
15220 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015221}
15222
15223static int
15224unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15225{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 Py_VISIT(it->it_seq);
15227 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015228}
15229
15230static PyObject *
15231unicodeiter_next(unicodeiterobject *it)
15232{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015233 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015234
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 assert(it != NULL);
15236 seq = it->it_seq;
15237 if (seq == NULL)
15238 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015239 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015241 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15242 int kind = PyUnicode_KIND(seq);
15243 void *data = PyUnicode_DATA(seq);
15244 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15245 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 if (item != NULL)
15247 ++it->it_index;
15248 return item;
15249 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015250
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015252 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015254}
15255
15256static PyObject *
15257unicodeiter_len(unicodeiterobject *it)
15258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 Py_ssize_t len = 0;
15260 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015261 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015263}
15264
15265PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15266
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015267static PyObject *
15268unicodeiter_reduce(unicodeiterobject *it)
15269{
15270 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015271 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015272 it->it_seq, it->it_index);
15273 } else {
15274 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15275 if (u == NULL)
15276 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015277 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015278 }
15279}
15280
15281PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15282
15283static PyObject *
15284unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15285{
15286 Py_ssize_t index = PyLong_AsSsize_t(state);
15287 if (index == -1 && PyErr_Occurred())
15288 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015289 if (it->it_seq != NULL) {
15290 if (index < 0)
15291 index = 0;
15292 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15293 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15294 it->it_index = index;
15295 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015296 Py_RETURN_NONE;
15297}
15298
15299PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15300
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015301static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015303 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015304 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15305 reduce_doc},
15306 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15307 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015309};
15310
15311PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15313 "str_iterator", /* tp_name */
15314 sizeof(unicodeiterobject), /* tp_basicsize */
15315 0, /* tp_itemsize */
15316 /* methods */
15317 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15318 0, /* tp_print */
15319 0, /* tp_getattr */
15320 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015321 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 0, /* tp_repr */
15323 0, /* tp_as_number */
15324 0, /* tp_as_sequence */
15325 0, /* tp_as_mapping */
15326 0, /* tp_hash */
15327 0, /* tp_call */
15328 0, /* tp_str */
15329 PyObject_GenericGetAttr, /* tp_getattro */
15330 0, /* tp_setattro */
15331 0, /* tp_as_buffer */
15332 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15333 0, /* tp_doc */
15334 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15335 0, /* tp_clear */
15336 0, /* tp_richcompare */
15337 0, /* tp_weaklistoffset */
15338 PyObject_SelfIter, /* tp_iter */
15339 (iternextfunc)unicodeiter_next, /* tp_iternext */
15340 unicodeiter_methods, /* tp_methods */
15341 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015342};
15343
15344static PyObject *
15345unicode_iter(PyObject *seq)
15346{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015348
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 if (!PyUnicode_Check(seq)) {
15350 PyErr_BadInternalCall();
15351 return NULL;
15352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015353 if (PyUnicode_READY(seq) == -1)
15354 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15356 if (it == NULL)
15357 return NULL;
15358 it->it_index = 0;
15359 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015360 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 _PyObject_GC_TRACK(it);
15362 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015363}
15364
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015365
/* Return the number of Py_UNICODE elements in `u` that precede the
   terminating zero element.

   The count is kept in a size_t, matching the return type, so that very
   long buffers cannot overflow a signed int counter. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;

    while (*u++)
        res++;
    return res;
}
15374
/* Copy the zero-terminated sequence `s2`, terminator included, into `s1`
   and return `s1`.  No bounds are checked. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch);
    return s1;
}
15382
/* Copy at most `n` elements from the zero-terminated sequence `s2` into
   `s1` and return `s1`.

   Copying stops right after the terminator has been stored, or once `n`
   elements have been written, whichever comes first.  Consequently:
     - the destination is NOT zero-terminated when `s2` holds `n` or more
       non-zero elements;
     - the remainder of the destination is left untouched (no padding).

   The bound is checked before each store so that no more than `n`
   elements are ever written (with n == 0 nothing is written). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n-- > 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15392
/* Append the zero-terminated sequence `s2` to the end of `s1` (located
   with Py_UNICODE_strlen) and return `s1`.  No bounds are checked. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
15401
/* Compare two zero-terminated Py_UNICODE sequences element by element.

   Returns 0 when they are equal, -1 when `s1` orders before `s2` and +1
   otherwise.  A sequence that is a proper prefix of the other orders
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE a, b;

    /* Skip the common prefix. */
    while (*s1 != 0 && *s2 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }
    a = *s1;
    b = *s2;

    if (a == 0)
        return (b == 0) ? 0 : -1;   /* both ended, or s1 ended first */
    if (b == 0)
        return 1;                   /* s2 ended first */
    return (a < b) ? -1 : +1;       /* first differing element decides */
}
15415
/* Compare at most `n` leading elements of two zero-terminated Py_UNICODE
   sequences.

   Returns -1 or +1 according to the first differing element, and 0 when
   no difference is found before a terminator or before `n` elements have
   been examined. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            return 0;
    }
    return 0;
}
15432
/* Return a pointer to the first element equal to `c` in the
   zero-terminated sequence `s`, or NULL if there is none.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
15442
/* Return a pointer to the last element equal to `c` in the
   zero-terminated sequence `s`, or NULL if there is none.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015455
Victor Stinner71133ff2010-09-01 23:43:53 +000015456Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015457PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015458{
Victor Stinner577db2c2011-10-11 22:12:48 +020015459 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015460 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015462 if (!PyUnicode_Check(unicode)) {
15463 PyErr_BadArgument();
15464 return NULL;
15465 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015466 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015467 if (u == NULL)
15468 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015469 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015470 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015471 PyErr_NoMemory();
15472 return NULL;
15473 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015474 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015475 size *= sizeof(Py_UNICODE);
15476 copy = PyMem_Malloc(size);
15477 if (copy == NULL) {
15478 PyErr_NoMemory();
15479 return NULL;
15480 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015481 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015482 return copy;
15483}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015484
Georg Brandl66c221e2010-10-14 07:04:07 +000015485/* A _string module, to export formatter_parser and formatter_field_name_split
15486 to the string.Formatter class implemented in Python. */
15487
15488static PyMethodDef _string_methods[] = {
15489 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15490 METH_O, PyDoc_STR("split the argument as a field name")},
15491 {"formatter_parser", (PyCFunction) formatter_parser,
15492 METH_O, PyDoc_STR("parse the argument as a format string")},
15493 {NULL, NULL}
15494};
15495
15496static struct PyModuleDef _string_module = {
15497 PyModuleDef_HEAD_INIT,
15498 "_string",
15499 PyDoc_STR("string helper module"),
15500 0,
15501 _string_methods,
15502 NULL,
15503 NULL,
15504 NULL,
15505 NULL
15506};
15507
15508PyMODINIT_FUNC
15509PyInit__string(void)
15510{
15511 return PyModule_Create(&_string_module);
15512}
15513
15514
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015515#ifdef __cplusplus
15516}
15517#endif