blob: 0bcacd834fb783eb75c144c725bd9ffc20ceff64 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner8faf8212011-12-08 22:14:11 +010069/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70#define MAX_UNICODE 0x10ffff
71
Victor Stinner910337b2011-10-03 03:20:16 +020072#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020073# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020074#else
75# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
76#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
Victor Stinnere90fe6a2011-10-01 16:48:13 +020078#define _PyUnicode_UTF8(op) \
79 (((PyCompactUnicodeObject*)(op))->utf8)
80#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020081 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020082 assert(PyUnicode_IS_READY(op)), \
83 PyUnicode_IS_COMPACT_ASCII(op) ? \
84 ((char*)((PyASCIIObject*)(op) + 1)) : \
85 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020086#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020087 (((PyCompactUnicodeObject*)(op))->utf8_length)
88#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020089 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020090 assert(PyUnicode_IS_READY(op)), \
91 PyUnicode_IS_COMPACT_ASCII(op) ? \
92 ((PyASCIIObject*)(op))->length : \
93 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020094#define _PyUnicode_WSTR(op) \
95 (((PyASCIIObject*)(op))->wstr)
96#define _PyUnicode_WSTR_LENGTH(op) \
97 (((PyCompactUnicodeObject*)(op))->wstr_length)
98#define _PyUnicode_LENGTH(op) \
99 (((PyASCIIObject *)(op))->length)
100#define _PyUnicode_STATE(op) \
101 (((PyASCIIObject *)(op))->state)
102#define _PyUnicode_HASH(op) \
103 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200104#define _PyUnicode_KIND(op) \
105 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200107#define _PyUnicode_GET_LENGTH(op) \
108 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200110#define _PyUnicode_DATA_ANY(op) \
111 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
Victor Stinner910337b2011-10-03 03:20:16 +0200113#undef PyUnicode_READY
114#define PyUnicode_READY(op) \
115 (assert(_PyUnicode_CHECK(op)), \
116 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200117 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100118 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
Victor Stinnerc379ead2011-10-03 12:52:27 +0200120#define _PyUnicode_SHARE_UTF8(op) \
121 (assert(_PyUnicode_CHECK(op)), \
122 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
123 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very same address as its character
   data (PyUnicode_DATA), i.e. both views refer to one buffer.
   The macro must only refer to its own parameter: it previously named a
   variable called "unicode", so it inspected whatever object the caller
   happened to have under that name (or failed to compile). */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
Victor Stinner829c0ad2011-10-03 01:08:02 +0200128/* true if the Unicode object has an allocated UTF-8 memory block
129 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200130#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200131 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200132 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200133 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
134
Victor Stinner03490912011-10-03 23:45:12 +0200135/* true if the Unicode object has an allocated wstr memory block
136 (not shared with other data) */
137#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200138 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200139 (!PyUnicode_IS_READY(op) || \
140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
Victor Stinner910337b2011-10-03 03:20:16 +0200142/* Generic helper macro to convert characters of different types.
143 from_type and to_type have to be valid type names, begin and end
144 are pointers to the source characters which should be of type
145 "from_type *". to is a pointer of type "to_type *" and points to the
146 buffer where the result characters are written to. */
147#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
148 do { \
Victor Stinner4a587072013-11-19 12:54:53 +0100149 to_type *_to = (to_type *)(to); \
150 const from_type *_iter = (from_type *)(begin); \
151 const from_type *_end = (from_type *)(end); \
Antoine Pitroue459a082011-10-11 20:58:41 +0200152 Py_ssize_t n = (_end) - (_iter); \
153 const from_type *_unrolled_end = \
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200154 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
Antoine Pitroue459a082011-10-11 20:58:41 +0200155 while (_iter < (_unrolled_end)) { \
156 _to[0] = (to_type) _iter[0]; \
157 _to[1] = (to_type) _iter[1]; \
158 _to[2] = (to_type) _iter[2]; \
159 _to[3] = (to_type) _iter[3]; \
160 _iter += 4; _to += 4; \
Victor Stinner910337b2011-10-03 03:20:16 +0200161 } \
Antoine Pitroue459a082011-10-11 20:58:41 +0200162 while (_iter < (_end)) \
163 *_to++ = (to_type) *_iter++; \
Victor Stinner910337b2011-10-03 03:20:16 +0200164 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200166#ifdef MS_WINDOWS
    /* On Windows, overallocating by 50% is the best factor */
168# define OVERALLOCATE_FACTOR 2
169#else
    /* On Linux, overallocating by 25% is the best factor */
171# define OVERALLOCATE_FACTOR 4
172#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
Christian Heimes190d79e2008-01-30 11:58:22 +0000217/* Fast detection of the most frequent whitespace characters */
218const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000219 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000220/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000221/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000222/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000223/* case 0x000C: * FORM FEED */
224/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000225 0, 1, 1, 1, 1, 1, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000227/* case 0x001C: * FILE SEPARATOR */
228/* case 0x001D: * GROUP SEPARATOR */
229/* case 0x001E: * RECORD SEPARATOR */
230/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000231 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000232/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000233 1, 0, 0, 0, 0, 0, 0, 0,
234 0, 0, 0, 0, 0, 0, 0, 0,
235 0, 0, 0, 0, 0, 0, 0, 0,
236 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000237
Benjamin Peterson14339b62009-01-31 16:36:08 +0000238 0, 0, 0, 0, 0, 0, 0, 0,
239 0, 0, 0, 0, 0, 0, 0, 0,
240 0, 0, 0, 0, 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 0, 0, 0, 0, 0, 0, 0, 0,
243 0, 0, 0, 0, 0, 0, 0, 0,
244 0, 0, 0, 0, 0, 0, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000246};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line break characters in the ASCII range: entry i is 1
   when code point i is a line break, 0 otherwise.  The table is only ever
   read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A * LINE FEED */
/*         0x000B * LINE TABULATION */
/*         0x000C * FORM FEED */
/*         0x000D * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C * FILE SEPARATOR */
/*         0x001D * GROUP SEPARATOR */
/*         0x001E * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers that have a dedicated fast path.  _Py_ERROR_OTHER
   stands for any handler name that is not in the list below. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler code.
   A NULL name is treated like "strict"; a name that matches none of the
   known handlers yields _Py_ERROR_OTHER.  Matching is exact (strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE}
    };
    size_t k;

    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
        if (strcmp(errors, known[k].name) == 0)
            return known[k].handler;
    }
    return _Py_ERROR_OTHER;
}
335
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300336/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
337 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000338Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000339PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000340{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000341#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000343#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000344 /* This is actually an illegal character, so it should
345 not be passed to unichr. */
346 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347#endif
348}
349
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   Every invariant is verified with assert(), so a violation aborts; when
   all of them hold the function returns 1, which lets callers wrap the call
   itself in assert().  If check_content is nonzero and the string is not in
   the wchar_t-only (PyUnicode_WCHAR_KIND) form, the characters are scanned
   as well to verify that the narrowest possible kind is used and that the
   buffer ends with a null character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        const int is_compact = (hdr->state.compact == 1);
        /* compact strings store their characters right after the header,
           the others through the data pointer of PyUnicodeObject */
        void *payload = is_compact
            ? (void *)(cobj + 1)
            : ((PyUnicodeObject *)op)->data.any;

        if (is_compact) {
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(cobj->utf8 != payload);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr representation exists */
            assert(hdr->length == 0);
            assert(hdr->hash == -1);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 0);
            assert(hdr->state.interned == SSTATE_NOT_INTERNED);
            assert(hdr->wstr != NULL);
            assert(payload == NULL);
            assert(cobj->utf8 == NULL);
        }
        else {
            /* ready, non-compact string */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ready == 1);
            assert(payload != NULL);
            if (hdr->state.ascii) {
                assert(cobj->utf8 == payload);
                assert(cobj->utf8_length == hdr->length);
            }
            else
                assert(cobj->utf8 != payload);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character buffer exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_sized_kind) {
                assert(hdr->wstr == payload);
                assert(cobj->wstr_length == hdr->length);
            }
            else
                assert(hdr->wstr != payload);
        }

        if (cobj->utf8 == NULL)
            assert(cobj->utf8_length == 0);
        if (hdr->wstr == NULL)
            assert(cobj->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(hdr);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 c = PyUnicode_READ(kind, chars, pos);
            if (c > highest)
                highest = c;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else
                assert(highest < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the character buffer must be null-terminated */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
465
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466static PyObject*
467unicode_result_wchar(PyObject *unicode)
468{
469#ifndef Py_DEBUG
470 Py_ssize_t len;
471
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 len = _PyUnicode_WSTR_LENGTH(unicode);
473 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200475 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 }
477
478 if (len == 1) {
479 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100480 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
482 Py_DECREF(unicode);
483 return latin1_char;
484 }
485 }
486
487 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200488 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 return NULL;
490 }
491#else
Victor Stinneraa771272012-10-04 02:32:58 +0200492 assert(Py_REFCNT(unicode) == 1);
493
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 /* don't make the result ready in debug mode to ensure that the caller
495 makes the string ready before using it */
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497#endif
498 return unicode;
499}
500
501static PyObject*
502unicode_result_ready(PyObject *unicode)
503{
504 Py_ssize_t length;
505
506 length = PyUnicode_GET_LENGTH(unicode);
507 if (length == 0) {
508 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100509 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 }
512 return unicode_empty;
513 }
514
515 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200516 void *data = PyUnicode_DATA(unicode);
517 int kind = PyUnicode_KIND(unicode);
518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519 if (ch < 256) {
520 PyObject *latin1_char = unicode_latin1[ch];
521 if (latin1_char != NULL) {
522 if (unicode != latin1_char) {
523 Py_INCREF(latin1_char);
524 Py_DECREF(unicode);
525 }
526 return latin1_char;
527 }
528 else {
529 assert(_PyUnicode_CheckConsistency(unicode, 1));
530 Py_INCREF(unicode);
531 unicode_latin1[ch] = unicode;
532 return unicode;
533 }
534 }
535 }
536
537 assert(_PyUnicode_CheckConsistency(unicode, 1));
538 return unicode;
539}
540
541static PyObject*
542unicode_result(PyObject *unicode)
543{
544 assert(_PyUnicode_CHECK(unicode));
545 if (PyUnicode_IS_READY(unicode))
546 return unicode_result_ready(unicode);
547 else
548 return unicode_result_wchar(unicode);
549}
550
Victor Stinnerc4b49542011-12-11 22:44:26 +0100551static PyObject*
552unicode_result_unchanged(PyObject *unicode)
553{
554 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500555 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100556 return NULL;
557 Py_INCREF(unicode);
558 return unicode;
559 }
560 else
561 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100562 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563}
564
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200565/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
566 ASCII, Latin1, UTF-8, etc. */
567static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200568backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200569 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
570{
Victor Stinnerad771582015-10-09 12:38:53 +0200571 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572 Py_UCS4 ch;
573 enum PyUnicode_Kind kind;
574 void *data;
575
576 assert(PyUnicode_IS_READY(unicode));
577 kind = PyUnicode_KIND(unicode);
578 data = PyUnicode_DATA(unicode);
579
580 size = 0;
581 /* determine replacement size */
582 for (i = collstart; i < collend; ++i) {
583 Py_ssize_t incr;
584
585 ch = PyUnicode_READ(kind, data, i);
586 if (ch < 0x100)
587 incr = 2+2;
588 else if (ch < 0x10000)
589 incr = 2+4;
590 else {
591 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200592 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 }
594 if (size > PY_SSIZE_T_MAX - incr) {
595 PyErr_SetString(PyExc_OverflowError,
596 "encoded result is too long for a Python string");
597 return NULL;
598 }
599 size += incr;
600 }
601
Victor Stinnerad771582015-10-09 12:38:53 +0200602 str = _PyBytesWriter_Prepare(writer, str, size);
603 if (str == NULL)
604 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605
606 /* generate replacement */
607 for (i = collstart; i < collend; ++i) {
608 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200609 *str++ = '\\';
610 if (ch >= 0x00010000) {
611 *str++ = 'U';
612 *str++ = Py_hexdigits[(ch>>28)&0xf];
613 *str++ = Py_hexdigits[(ch>>24)&0xf];
614 *str++ = Py_hexdigits[(ch>>20)&0xf];
615 *str++ = Py_hexdigits[(ch>>16)&0xf];
616 *str++ = Py_hexdigits[(ch>>12)&0xf];
617 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618 }
Victor Stinner797485e2015-10-09 03:17:30 +0200619 else if (ch >= 0x100) {
620 *str++ = 'u';
621 *str++ = Py_hexdigits[(ch>>12)&0xf];
622 *str++ = Py_hexdigits[(ch>>8)&0xf];
623 }
624 else
625 *str++ = 'x';
626 *str++ = Py_hexdigits[(ch>>4)&0xf];
627 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628 }
629 return str;
630}
631
632/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
633 ASCII, Latin1, UTF-8, etc. */
634static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200635xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
637{
Victor Stinnerad771582015-10-09 12:38:53 +0200638 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200639 Py_UCS4 ch;
640 enum PyUnicode_Kind kind;
641 void *data;
642
643 assert(PyUnicode_IS_READY(unicode));
644 kind = PyUnicode_KIND(unicode);
645 data = PyUnicode_DATA(unicode);
646
647 size = 0;
648 /* determine replacement size */
649 for (i = collstart; i < collend; ++i) {
650 Py_ssize_t incr;
651
652 ch = PyUnicode_READ(kind, data, i);
653 if (ch < 10)
654 incr = 2+1+1;
655 else if (ch < 100)
656 incr = 2+2+1;
657 else if (ch < 1000)
658 incr = 2+3+1;
659 else if (ch < 10000)
660 incr = 2+4+1;
661 else if (ch < 100000)
662 incr = 2+5+1;
663 else if (ch < 1000000)
664 incr = 2+6+1;
665 else {
666 assert(ch <= MAX_UNICODE);
667 incr = 2+7+1;
668 }
669 if (size > PY_SSIZE_T_MAX - incr) {
670 PyErr_SetString(PyExc_OverflowError,
671 "encoded result is too long for a Python string");
672 return NULL;
673 }
674 size += incr;
675 }
676
Victor Stinnerad771582015-10-09 12:38:53 +0200677 str = _PyBytesWriter_Prepare(writer, str, size);
678 if (str == NULL)
679 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200680
681 /* generate replacement */
682 for (i = collstart; i < collend; ++i) {
683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
684 }
685 return str;
686}
687
Thomas Wouters477c8d52006-05-27 19:21:47 +0000688/* --- Bloom Filters ----------------------------------------------------- */
689
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1), i.e. 5, 6 or 7 bits
   depending on BLOOM_WIDTH) as the bit index. */
693
694/* the linebreak mask is set up by Unicode_Init below */
695
Antoine Pitrouf068f942010-01-13 14:19:12 +0000696#if LONG_BIT >= 128
697#define BLOOM_WIDTH 128
698#elif LONG_BIT >= 64
699#define BLOOM_WIDTH 64
700#elif LONG_BIT >= 32
701#define BLOOM_WIDTH 32
702#else
703#error "LONG_BIT is smaller than 32"
704#endif
705
Thomas Wouters477c8d52006-05-27 19:21:47 +0000706#define BLOOM_MASK unsigned long
707
Serhiy Storchaka05997252013-01-26 12:14:02 +0200708static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000709
Antoine Pitrouf068f942010-01-13 14:19:12 +0000710#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711
Benjamin Peterson29060642009-01-31 22:14:21 +0000712#define BLOOM_LINEBREAK(ch) \
713 ((ch) < 128U ? ascii_linebreak[(ch)] : \
714 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000715
Alexander Belopolsky40018472011-02-26 01:02:56 +0000716Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718{
Victor Stinnera85af502013-04-09 21:53:54 +0200719#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
720 do { \
721 TYPE *data = (TYPE *)PTR; \
722 TYPE *end = data + LEN; \
723 Py_UCS4 ch; \
724 for (; data != end; data++) { \
725 ch = *data; \
726 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
727 } \
728 break; \
729 } while (0)
730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* calculate simple bloom-style bitmask for a given unicode string */
732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
735 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200736 switch (kind) {
737 case PyUnicode_1BYTE_KIND:
738 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
739 break;
740 case PyUnicode_2BYTE_KIND:
741 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
742 break;
743 case PyUnicode_4BYTE_KIND:
744 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
745 break;
746 default:
747 assert(0);
748 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200750
751#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752}
753
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200754/* Compilation of templated routines */
755
756#include "stringlib/asciilib.h"
757#include "stringlib/fastsearch.h"
758#include "stringlib/partition.h"
759#include "stringlib/split.h"
760#include "stringlib/count.h"
761#include "stringlib/find.h"
762#include "stringlib/find_max_char.h"
763#include "stringlib/localeutil.h"
764#include "stringlib/undef.h"
765
766#include "stringlib/ucs1lib.h"
767#include "stringlib/fastsearch.h"
768#include "stringlib/partition.h"
769#include "stringlib/split.h"
770#include "stringlib/count.h"
771#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300772#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773#include "stringlib/find_max_char.h"
774#include "stringlib/localeutil.h"
775#include "stringlib/undef.h"
776
777#include "stringlib/ucs2lib.h"
778#include "stringlib/fastsearch.h"
779#include "stringlib/partition.h"
780#include "stringlib/split.h"
781#include "stringlib/count.h"
782#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300783#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200784#include "stringlib/find_max_char.h"
785#include "stringlib/localeutil.h"
786#include "stringlib/undef.h"
787
788#include "stringlib/ucs4lib.h"
789#include "stringlib/fastsearch.h"
790#include "stringlib/partition.h"
791#include "stringlib/split.h"
792#include "stringlib/count.h"
793#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300794#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200795#include "stringlib/find_max_char.h"
796#include "stringlib/localeutil.h"
797#include "stringlib/undef.h"
798
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200799#include "stringlib/unicodedefs.h"
800#include "stringlib/fastsearch.h"
801#include "stringlib/count.h"
802#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100803#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200804
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805/* --- Unicode Object ----------------------------------------------------- */
806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200808fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200810Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200811 Py_ssize_t size, Py_UCS4 ch,
812 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200814 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
815
816 switch (kind) {
817 case PyUnicode_1BYTE_KIND:
818 {
819 Py_UCS1 ch1 = (Py_UCS1) ch;
820 if (ch1 == ch)
821 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
822 else
823 return -1;
824 }
825 case PyUnicode_2BYTE_KIND:
826 {
827 Py_UCS2 ch2 = (Py_UCS2) ch;
828 if (ch2 == ch)
829 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
830 else
831 return -1;
832 }
833 case PyUnicode_4BYTE_KIND:
834 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
835 default:
836 assert(0);
837 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839}
840
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, length) of a string
   with 0xff bytes so that use of uninitialized data is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the current length is not greater than old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    Py_ssize_t char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
859
Victor Stinnerfe226c02011-10-03 03:52:20 +0200860static PyObject*
861resize_compact(PyObject *unicode, Py_ssize_t length)
862{
863 Py_ssize_t char_size;
864 Py_ssize_t struct_size;
865 Py_ssize_t new_size;
866 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100867 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200868#ifdef Py_DEBUG
869 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
870#endif
871
Victor Stinner79891572012-05-03 13:43:07 +0200872 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200873 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100874 assert(PyUnicode_IS_COMPACT(unicode));
875
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200876 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878 struct_size = sizeof(PyASCIIObject);
879 else
880 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200881 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200882
Victor Stinnerfe226c02011-10-03 03:52:20 +0200883 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
884 PyErr_NoMemory();
885 return NULL;
886 }
887 new_size = (struct_size + (length + 1) * char_size);
888
Victor Stinner84def372011-12-11 20:04:56 +0100889 _Py_DEC_REFTOTAL;
890 _Py_ForgetReference(unicode);
891
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300892 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100893 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100894 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895 PyErr_NoMemory();
896 return NULL;
897 }
Victor Stinner84def372011-12-11 20:04:56 +0100898 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200899 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200902 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200903 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100904 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100907 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_WSTR(unicode));
909 _PyUnicode_WSTR(unicode) = NULL;
910 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200911#ifdef Py_DEBUG
912 unicode_fill_invalid(unicode, old_length);
913#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200914 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
915 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200916 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917 return unicode;
918}
919
Alexander Belopolsky40018472011-02-26 01:02:56 +0000920static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200921resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922{
Victor Stinner95663112011-10-04 01:03:50 +0200923 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100924 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000927
Victor Stinnerfe226c02011-10-03 03:52:20 +0200928 if (PyUnicode_IS_READY(unicode)) {
929 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200930 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200931 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200932#ifdef Py_DEBUG
933 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
934#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935
936 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200937 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
939 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940
941 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
942 PyErr_NoMemory();
943 return -1;
944 }
945 new_size = (length + 1) * char_size;
946
Victor Stinner7a9105a2011-12-12 00:13:42 +0100947 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
948 {
949 PyObject_DEL(_PyUnicode_UTF8(unicode));
950 _PyUnicode_UTF8(unicode) = NULL;
951 _PyUnicode_UTF8_LENGTH(unicode) = 0;
952 }
953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 data = (PyObject *)PyObject_REALLOC(data, new_size);
955 if (data == NULL) {
956 PyErr_NoMemory();
957 return -1;
958 }
959 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200960 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200962 _PyUnicode_WSTR_LENGTH(unicode) = length;
963 }
964 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200965 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200966 _PyUnicode_UTF8_LENGTH(unicode) = length;
967 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 _PyUnicode_LENGTH(unicode) = length;
969 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200970#ifdef Py_DEBUG
971 unicode_fill_invalid(unicode, old_length);
972#endif
Victor Stinner95663112011-10-04 01:03:50 +0200973 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200974 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200975 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200976 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977 }
Victor Stinner95663112011-10-04 01:03:50 +0200978 assert(_PyUnicode_WSTR(unicode) != NULL);
979
980 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700981 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200982 PyErr_NoMemory();
983 return -1;
984 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100985 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200986 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100987 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200988 if (!wstr) {
989 PyErr_NoMemory();
990 return -1;
991 }
992 _PyUnicode_WSTR(unicode) = wstr;
993 _PyUnicode_WSTR(unicode)[length] = 0;
994 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200995 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996 return 0;
997}
998
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999static PyObject*
1000resize_copy(PyObject *unicode, Py_ssize_t length)
1001{
1002 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001003 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001005
Benjamin Petersonbac79492012-01-14 13:34:47 -05001006 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001007 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008
1009 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1010 if (copy == NULL)
1011 return NULL;
1012
1013 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001014 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001015 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001016 }
1017 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001018 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001019
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001020 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021 if (w == NULL)
1022 return NULL;
1023 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1024 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001025 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1026 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001027 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 }
1029}
1030
/* We allocate one more character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034
1035 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001036 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037
1038*/
1039
Alexander Belopolsky40018472011-02-26 01:02:56 +00001040static PyUnicodeObject *
1041_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001043 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
Thomas Wouters477c8d52006-05-27 19:21:47 +00001046 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (length == 0 && unicode_empty != NULL) {
1048 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001049 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 }
1051
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001052 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001053 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001054 return (PyUnicodeObject *)PyErr_NoMemory();
1055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 if (length < 0) {
1057 PyErr_SetString(PyExc_SystemError,
1058 "Negative size passed to _PyUnicode_New");
1059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 }
1061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1063 if (unicode == NULL)
1064 return NULL;
1065 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001066
1067 _PyUnicode_WSTR_LENGTH(unicode) = length;
1068 _PyUnicode_HASH(unicode) = -1;
1069 _PyUnicode_STATE(unicode).interned = 0;
1070 _PyUnicode_STATE(unicode).kind = 0;
1071 _PyUnicode_STATE(unicode).compact = 0;
1072 _PyUnicode_STATE(unicode).ready = 0;
1073 _PyUnicode_STATE(unicode).ascii = 0;
1074 _PyUnicode_DATA_ANY(unicode) = NULL;
1075 _PyUnicode_LENGTH(unicode) = 0;
1076 _PyUnicode_UTF8(unicode) = NULL;
1077 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1080 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001081 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001083 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085
Jeremy Hyltond8082792003-09-16 19:41:39 +00001086 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001087 * the caller fails before initializing str -- unicode_resize()
1088 * reads str[0], and the Keep-Alive optimization can keep memory
1089 * allocated for str alive across a call to unicode_dealloc(unicode).
1090 * We don't want unicode_resize to read uninitialized memory in
1091 * that case.
1092 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 _PyUnicode_WSTR(unicode)[0] = 0;
1094 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001095
Victor Stinner7931d9a2011-11-04 00:22:48 +01001096 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 return unicode;
1098}
1099
Victor Stinnerf42dc442011-10-02 23:33:16 +02001100static const char*
1101unicode_kind_name(PyObject *unicode)
1102{
Victor Stinner42dfd712011-10-03 14:41:45 +02001103 /* don't check consistency: unicode_kind_name() is called from
1104 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001105 if (!PyUnicode_IS_COMPACT(unicode))
1106 {
1107 if (!PyUnicode_IS_READY(unicode))
1108 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001109 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001110 {
1111 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001112 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001113 return "legacy ascii";
1114 else
1115 return "legacy latin1";
1116 case PyUnicode_2BYTE_KIND:
1117 return "legacy UCS2";
1118 case PyUnicode_4BYTE_KIND:
1119 return "legacy UCS4";
1120 default:
1121 return "<legacy invalid kind>";
1122 }
1123 }
1124 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001125 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001127 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 return "ascii";
1129 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001130 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001132 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001133 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001134 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 default:
1136 return "<invalid compact kind>";
1137 }
1138}
1139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1145
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode) {
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1149void *_PyUnicode_data(void *unicode){
1150 printf("obj %p\n", unicode);
1151 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1152 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1153 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1154 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1155 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1156 return PyUnicode_DATA(unicode);
1157}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001158
1159void
1160_PyUnicode_Dump(PyObject *op)
1161{
1162 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001163 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1164 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1165 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001166
Victor Stinnera849a4b2011-10-03 12:12:11 +02001167 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001168 {
1169 if (ascii->state.ascii)
1170 data = (ascii + 1);
1171 else
1172 data = (compact + 1);
1173 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001174 else
1175 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001176 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1177 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001178
Victor Stinnera849a4b2011-10-03 12:12:11 +02001179 if (ascii->wstr == data)
1180 printf("shared ");
1181 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001182
Victor Stinnera3b334d2011-10-03 13:53:37 +02001183 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001184 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001185 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1186 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001187 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1188 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001189 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001190 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001191}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192#endif
1193
1194PyObject *
1195PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1196{
1197 PyObject *obj;
1198 PyCompactUnicodeObject *unicode;
1199 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001200 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001201 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 Py_ssize_t char_size;
1203 Py_ssize_t struct_size;
1204
1205 /* Optimization for empty strings */
1206 if (size == 0 && unicode_empty != NULL) {
1207 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001208 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 }
1210
Victor Stinner9e9d6892011-10-04 01:02:02 +02001211 is_ascii = 0;
1212 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 struct_size = sizeof(PyCompactUnicodeObject);
1214 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001215 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 char_size = 1;
1217 is_ascii = 1;
1218 struct_size = sizeof(PyASCIIObject);
1219 }
1220 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001221 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 char_size = 1;
1223 }
1224 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001225 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 char_size = 2;
1227 if (sizeof(wchar_t) == 2)
1228 is_sharing = 1;
1229 }
1230 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001231 if (maxchar > MAX_UNICODE) {
1232 PyErr_SetString(PyExc_SystemError,
1233 "invalid maximum character passed to PyUnicode_New");
1234 return NULL;
1235 }
Victor Stinner8f825062012-04-27 13:55:39 +02001236 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 char_size = 4;
1238 if (sizeof(wchar_t) == 4)
1239 is_sharing = 1;
1240 }
1241
1242 /* Ensure we won't overflow the size. */
1243 if (size < 0) {
1244 PyErr_SetString(PyExc_SystemError,
1245 "Negative size passed to PyUnicode_New");
1246 return NULL;
1247 }
1248 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1249 return PyErr_NoMemory();
1250
1251 /* Duplicated allocation code from _PyObject_New() instead of a call to
1252 * PyObject_New() so we are able to allocate space for the object and
1253 * it's data buffer.
1254 */
1255 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1256 if (obj == NULL)
1257 return PyErr_NoMemory();
1258 obj = PyObject_INIT(obj, &PyUnicode_Type);
1259 if (obj == NULL)
1260 return NULL;
1261
1262 unicode = (PyCompactUnicodeObject *)obj;
1263 if (is_ascii)
1264 data = ((PyASCIIObject*)obj) + 1;
1265 else
1266 data = unicode + 1;
1267 _PyUnicode_LENGTH(unicode) = size;
1268 _PyUnicode_HASH(unicode) = -1;
1269 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001270 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271 _PyUnicode_STATE(unicode).compact = 1;
1272 _PyUnicode_STATE(unicode).ready = 1;
1273 _PyUnicode_STATE(unicode).ascii = is_ascii;
1274 if (is_ascii) {
1275 ((char*)data)[size] = 0;
1276 _PyUnicode_WSTR(unicode) = NULL;
1277 }
Victor Stinner8f825062012-04-27 13:55:39 +02001278 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 ((char*)data)[size] = 0;
1280 _PyUnicode_WSTR(unicode) = NULL;
1281 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001283 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 else {
1286 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001287 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001288 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001290 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291 ((Py_UCS4*)data)[size] = 0;
1292 if (is_sharing) {
1293 _PyUnicode_WSTR_LENGTH(unicode) = size;
1294 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1295 }
1296 else {
1297 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1298 _PyUnicode_WSTR(unicode) = NULL;
1299 }
1300 }
Victor Stinner8f825062012-04-27 13:55:39 +02001301#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001302 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001303#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001304 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return obj;
1306}
1307
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data of `unicode`.  A high surrogate immediately followed by
   a low surrogate is joined into one code point; every other unit is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   `unicode` must be a 4-byte kind string whose length is exactly the number
   of code points produced; it is assumed to have room for one more code
   point than that for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* not the start of a complete surrogate pair: copy one unit */
            *dst++ = *src++;
        }
        else {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1347
Victor Stinnercd9950f2011-10-02 00:34:53 +02001348static int
Victor Stinner488fa492011-12-12 00:01:39 +01001349unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001350{
Victor Stinner488fa492011-12-12 00:01:39 +01001351 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001352 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001353 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001354 return -1;
1355 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001356 return 0;
1357}
1358
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001359static int
1360_copy_characters(PyObject *to, Py_ssize_t to_start,
1361 PyObject *from, Py_ssize_t from_start,
1362 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001364 unsigned int from_kind, to_kind;
1365 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366
Victor Stinneree4544c2012-05-09 22:24:08 +02001367 assert(0 <= how_many);
1368 assert(0 <= from_start);
1369 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001371 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001372 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
Victor Stinnerd3f08822012-05-29 12:57:52 +02001374 assert(PyUnicode_Check(to));
1375 assert(PyUnicode_IS_READY(to));
1376 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1377
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001378 if (how_many == 0)
1379 return 0;
1380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001382 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001384 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385
Victor Stinnerf1852262012-06-16 16:38:26 +02001386#ifdef Py_DEBUG
1387 if (!check_maxchar
1388 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1389 {
1390 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1391 Py_UCS4 ch;
1392 Py_ssize_t i;
1393 for (i=0; i < how_many; i++) {
1394 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1395 assert(ch <= to_maxchar);
1396 }
1397 }
1398#endif
1399
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001400 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001401 if (check_maxchar
1402 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1403 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 /* Writing Latin-1 characters into an ASCII string requires to
1405 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001406 Py_UCS4 max_char;
1407 max_char = ucs1lib_find_max_char(from_data,
1408 (Py_UCS1*)from_data + how_many);
1409 if (max_char >= 128)
1410 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001411 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001412 Py_MEMCPY((char*)to_data + to_kind * to_start,
1413 (char*)from_data + from_kind * from_start,
1414 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001416 else if (from_kind == PyUnicode_1BYTE_KIND
1417 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001418 {
1419 _PyUnicode_CONVERT_BYTES(
1420 Py_UCS1, Py_UCS2,
1421 PyUnicode_1BYTE_DATA(from) + from_start,
1422 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1423 PyUnicode_2BYTE_DATA(to) + to_start
1424 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001425 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001426 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001427 && to_kind == PyUnicode_4BYTE_KIND)
1428 {
1429 _PyUnicode_CONVERT_BYTES(
1430 Py_UCS1, Py_UCS4,
1431 PyUnicode_1BYTE_DATA(from) + from_start,
1432 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1433 PyUnicode_4BYTE_DATA(to) + to_start
1434 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001435 }
1436 else if (from_kind == PyUnicode_2BYTE_KIND
1437 && to_kind == PyUnicode_4BYTE_KIND)
1438 {
1439 _PyUnicode_CONVERT_BYTES(
1440 Py_UCS2, Py_UCS4,
1441 PyUnicode_2BYTE_DATA(from) + from_start,
1442 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1443 PyUnicode_4BYTE_DATA(to) + to_start
1444 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001445 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001446 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001447 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1448
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001449 if (!check_maxchar) {
1450 if (from_kind == PyUnicode_2BYTE_KIND
1451 && to_kind == PyUnicode_1BYTE_KIND)
1452 {
1453 _PyUnicode_CONVERT_BYTES(
1454 Py_UCS2, Py_UCS1,
1455 PyUnicode_2BYTE_DATA(from) + from_start,
1456 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1457 PyUnicode_1BYTE_DATA(to) + to_start
1458 );
1459 }
1460 else if (from_kind == PyUnicode_4BYTE_KIND
1461 && to_kind == PyUnicode_1BYTE_KIND)
1462 {
1463 _PyUnicode_CONVERT_BYTES(
1464 Py_UCS4, Py_UCS1,
1465 PyUnicode_4BYTE_DATA(from) + from_start,
1466 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1467 PyUnicode_1BYTE_DATA(to) + to_start
1468 );
1469 }
1470 else if (from_kind == PyUnicode_4BYTE_KIND
1471 && to_kind == PyUnicode_2BYTE_KIND)
1472 {
1473 _PyUnicode_CONVERT_BYTES(
1474 Py_UCS4, Py_UCS2,
1475 PyUnicode_4BYTE_DATA(from) + from_start,
1476 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1477 PyUnicode_2BYTE_DATA(to) + to_start
1478 );
1479 }
1480 else {
1481 assert(0);
1482 return -1;
1483 }
1484 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001485 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001487 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 Py_ssize_t i;
1489
Victor Stinnera0702ab2011-09-29 14:14:38 +02001490 for (i=0; i < how_many; i++) {
1491 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001492 if (ch > to_maxchar)
1493 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001494 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1495 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001496 }
1497 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001498 return 0;
1499}
1500
Victor Stinnerd3f08822012-05-29 12:57:52 +02001501void
1502_PyUnicode_FastCopyCharacters(
1503 PyObject *to, Py_ssize_t to_start,
1504 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001505{
1506 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1507}
1508
1509Py_ssize_t
1510PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1511 PyObject *from, Py_ssize_t from_start,
1512 Py_ssize_t how_many)
1513{
1514 int err;
1515
1516 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1517 PyErr_BadInternalCall();
1518 return -1;
1519 }
1520
Benjamin Petersonbac79492012-01-14 13:34:47 -05001521 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001522 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001523 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return -1;
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526 if (from_start < 0) {
1527 PyErr_SetString(PyExc_IndexError, "string index out of range");
1528 return -1;
1529 }
1530 if (to_start < 0) {
1531 PyErr_SetString(PyExc_IndexError, "string index out of range");
1532 return -1;
1533 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001534 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1535 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1536 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001537 "Cannot write %zi characters at %zi "
1538 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 how_many, to_start, PyUnicode_GET_LENGTH(to));
1540 return -1;
1541 }
1542
1543 if (how_many == 0)
1544 return 0;
1545
Victor Stinner488fa492011-12-12 00:01:39 +01001546 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
1548
1549 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1550 if (err) {
1551 PyErr_Format(PyExc_SystemError,
1552 "Cannot copy %s characters "
1553 "into a string of %s characters",
1554 unicode_kind_name(from),
1555 unicode_kind_name(to));
1556 return -1;
1557 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001558 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559}
1560
Victor Stinner17222162011-09-28 22:15:37 +02001561/* Find the maximum code point and count the number of surrogate pairs so a
1562 correct string length can be computed before converting a string to UCS4.
1563 This function counts single surrogates as a character and not as a pair.
1564
1565 Return 0 on success, or -1 on error. */
1566static int
1567find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1568 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569{
1570 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001571 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572
Victor Stinnerc53be962011-10-02 21:33:54 +02001573 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 *num_surrogates = 0;
1575 *maxchar = 0;
1576
1577 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001579 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1580 && (iter+1) < end
1581 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1582 {
1583 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1584 ++(*num_surrogates);
1585 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 }
1587 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001589 {
1590 ch = *iter;
1591 iter++;
1592 }
1593 if (ch > *maxchar) {
1594 *maxchar = ch;
1595 if (*maxchar > MAX_UNICODE) {
1596 PyErr_Format(PyExc_ValueError,
1597 "character U+%x is not in range [U+0000; U+10ffff]",
1598 ch);
1599 return -1;
1600 }
1601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 }
1603 return 0;
1604}
1605
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001606int
1607_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608{
1609 wchar_t *end;
1610 Py_UCS4 maxchar = 0;
1611 Py_ssize_t num_surrogates;
1612#if SIZEOF_WCHAR_T == 2
1613 Py_ssize_t length_wo_surrogates;
1614#endif
1615
Georg Brandl7597add2011-10-05 16:36:47 +02001616 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001617 strings were created using _PyObject_New() and where no canonical
1618 representation (the str field) has been set yet aka strings
1619 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001620 assert(_PyUnicode_CHECK(unicode));
1621 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001623 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001624 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001625 /* Actually, it should neither be interned nor be anything else: */
1626 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001629 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001630 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632
1633 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001634 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1635 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 PyErr_NoMemory();
1637 return -1;
1638 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001639 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 _PyUnicode_WSTR(unicode), end,
1641 PyUnicode_1BYTE_DATA(unicode));
1642 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1643 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1644 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1645 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001646 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001647 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001648 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 }
1650 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001651 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001652 _PyUnicode_UTF8(unicode) = NULL;
1653 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 }
1655 PyObject_FREE(_PyUnicode_WSTR(unicode));
1656 _PyUnicode_WSTR(unicode) = NULL;
1657 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1658 }
1659 /* In this case we might have to convert down from 4-byte native
1660 wchar_t to 2-byte unicode. */
1661 else if (maxchar < 65536) {
1662 assert(num_surrogates == 0 &&
1663 "FindMaxCharAndNumSurrogatePairs() messed up");
1664
Victor Stinner506f5922011-09-28 22:34:18 +02001665#if SIZEOF_WCHAR_T == 2
1666 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001667 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001668 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1669 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1670 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001671 _PyUnicode_UTF8(unicode) = NULL;
1672 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001673#else
1674 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001675 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001676 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001678 PyErr_NoMemory();
1679 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 }
Victor Stinner506f5922011-09-28 22:34:18 +02001681 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1682 _PyUnicode_WSTR(unicode), end,
1683 PyUnicode_2BYTE_DATA(unicode));
1684 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1685 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1686 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001687 _PyUnicode_UTF8(unicode) = NULL;
1688 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001689 PyObject_FREE(_PyUnicode_WSTR(unicode));
1690 _PyUnicode_WSTR(unicode) = NULL;
1691 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1692#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1695 else {
1696#if SIZEOF_WCHAR_T == 2
1697 /* in case the native representation is 2-bytes, we need to allocate a
1698 new normalized 4-byte version. */
1699 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001700 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1701 PyErr_NoMemory();
1702 return -1;
1703 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
1709 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1710 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001711 _PyUnicode_UTF8(unicode) = NULL;
1712 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001713 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1714 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001715 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 PyObject_FREE(_PyUnicode_WSTR(unicode));
1717 _PyUnicode_WSTR(unicode) = NULL;
1718 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1719#else
1720 assert(num_surrogates == 0);
1721
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001724 _PyUnicode_UTF8(unicode) = NULL;
1725 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1727#endif
1728 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1729 }
1730 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001731 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 return 0;
1733}
1734
Alexander Belopolsky40018472011-02-26 01:02:56 +00001735static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001736unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737{
Walter Dörwald16807132007-05-25 13:52:07 +00001738 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001739 case SSTATE_NOT_INTERNED:
1740 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001741
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 case SSTATE_INTERNED_MORTAL:
1743 /* revive dead object temporarily for DelItem */
1744 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001745 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001746 Py_FatalError(
1747 "deletion of interned string failed");
1748 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 case SSTATE_INTERNED_IMMORTAL:
1751 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001752
Benjamin Peterson29060642009-01-31 22:14:21 +00001753 default:
1754 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001755 }
1756
Victor Stinner03490912011-10-03 23:45:12 +02001757 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001759 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001760 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001761 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1762 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001764 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765}
1766
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is the shared empty string or one of
   the cached single-character strings of unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be cached. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784static int
Victor Stinner488fa492011-12-12 00:01:39 +01001785unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001786{
Victor Stinner488fa492011-12-12 00:01:39 +01001787 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001788 if (Py_REFCNT(unicode) != 1)
1789 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001790 if (_PyUnicode_HASH(unicode) != -1)
1791 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001792 if (PyUnicode_CHECK_INTERNED(unicode))
1793 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001794 if (!PyUnicode_CheckExact(unicode))
1795 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001796#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001797 /* singleton refcount is greater than 1 */
1798 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001799#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001800 return 1;
1801}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001802
Victor Stinnerfe226c02011-10-03 03:52:20 +02001803static int
1804unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1805{
1806 PyObject *unicode;
1807 Py_ssize_t old_length;
1808
1809 assert(p_unicode != NULL);
1810 unicode = *p_unicode;
1811
1812 assert(unicode != NULL);
1813 assert(PyUnicode_Check(unicode));
1814 assert(0 <= length);
1815
Victor Stinner910337b2011-10-03 03:20:16 +02001816 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 old_length = PyUnicode_WSTR_LENGTH(unicode);
1818 else
1819 old_length = PyUnicode_GET_LENGTH(unicode);
1820 if (old_length == length)
1821 return 0;
1822
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001823 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001824 _Py_INCREF_UNICODE_EMPTY();
1825 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001826 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001827 Py_DECREF(*p_unicode);
1828 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001829 return 0;
1830 }
1831
Victor Stinner488fa492011-12-12 00:01:39 +01001832 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 PyObject *copy = resize_copy(unicode, length);
1834 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001836 Py_DECREF(*p_unicode);
1837 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001838 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001839 }
1840
Victor Stinnerfe226c02011-10-03 03:52:20 +02001841 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001842 PyObject *new_unicode = resize_compact(unicode, length);
1843 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001844 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001845 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001846 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001848 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001849}
1850
Alexander Belopolsky40018472011-02-26 01:02:56 +00001851int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001852PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001853{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001854 PyObject *unicode;
1855 if (p_unicode == NULL) {
1856 PyErr_BadInternalCall();
1857 return -1;
1858 }
1859 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001860 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001861 {
1862 PyErr_BadInternalCall();
1863 return -1;
1864 }
1865 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001866}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867
Victor Stinnerc5166102012-02-22 13:55:02 +01001868/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001869
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001870 WARNING: The function doesn't copy the terminating null character and
1871 doesn't check the maximum character (may write a latin1 character in an
1872 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001873static void
1874unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1875 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001876{
1877 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1878 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001879 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001880
1881 switch (kind) {
1882 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001883 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001884#ifdef Py_DEBUG
1885 if (PyUnicode_IS_ASCII(unicode)) {
1886 Py_UCS4 maxchar = ucs1lib_find_max_char(
1887 (const Py_UCS1*)str,
1888 (const Py_UCS1*)str + len);
1889 assert(maxchar < 128);
1890 }
1891#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001892 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001893 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001894 }
1895 case PyUnicode_2BYTE_KIND: {
1896 Py_UCS2 *start = (Py_UCS2 *)data + index;
1897 Py_UCS2 *ucs2 = start;
1898 assert(index <= PyUnicode_GET_LENGTH(unicode));
1899
Victor Stinner184252a2012-06-16 02:57:41 +02001900 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001901 *ucs2 = (Py_UCS2)*str;
1902
1903 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001904 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001905 }
1906 default: {
1907 Py_UCS4 *start = (Py_UCS4 *)data + index;
1908 Py_UCS4 *ucs4 = start;
1909 assert(kind == PyUnicode_4BYTE_KIND);
1910 assert(index <= PyUnicode_GET_LENGTH(unicode));
1911
Victor Stinner184252a2012-06-16 02:57:41 +02001912 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001913 *ucs4 = (Py_UCS4)*str;
1914
1915 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001916 }
1917 }
1918}
1919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920static PyObject*
1921get_latin1_char(unsigned char ch)
1922{
Victor Stinnera464fc12011-10-02 20:39:30 +02001923 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001925 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!unicode)
1927 return NULL;
1928 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001929 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 unicode_latin1[ch] = unicode;
1931 }
1932 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001933 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934}
1935
Victor Stinner985a82a2014-01-03 12:53:47 +01001936static PyObject*
1937unicode_char(Py_UCS4 ch)
1938{
1939 PyObject *unicode;
1940
1941 assert(ch <= MAX_UNICODE);
1942
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001943 if (ch < 256)
1944 return get_latin1_char(ch);
1945
Victor Stinner985a82a2014-01-03 12:53:47 +01001946 unicode = PyUnicode_New(1, ch);
1947 if (unicode == NULL)
1948 return NULL;
1949 switch (PyUnicode_KIND(unicode)) {
1950 case PyUnicode_1BYTE_KIND:
1951 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1952 break;
1953 case PyUnicode_2BYTE_KIND:
1954 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1955 break;
1956 default:
1957 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1958 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1959 }
1960 assert(_PyUnicode_CheckConsistency(unicode, 1));
1961 return unicode;
1962}
1963
Alexander Belopolsky40018472011-02-26 01:02:56 +00001964PyObject *
1965PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001967 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 Py_UCS4 maxchar = 0;
1969 Py_ssize_t num_surrogates;
1970
1971 if (u == NULL)
1972 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001974 /* If the Unicode data is known at construction time, we can apply
1975 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001978 if (size == 0)
1979 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 /* Single character Unicode objects in the Latin-1 range are
1982 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001983 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 return get_latin1_char((unsigned char)*u);
1985
1986 /* If not empty and not single character, copy the Unicode data
1987 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001988 if (find_maxchar_surrogates(u, u + size,
1989 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return NULL;
1991
Victor Stinner8faf8212011-12-08 22:14:11 +01001992 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 if (!unicode)
1994 return NULL;
1995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 switch (PyUnicode_KIND(unicode)) {
1997 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001998 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2000 break;
2001 case PyUnicode_2BYTE_KIND:
2002#if Py_UNICODE_SIZE == 2
2003 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2004#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002005 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2007#endif
2008 break;
2009 case PyUnicode_4BYTE_KIND:
2010#if SIZEOF_WCHAR_T == 2
2011 /* This is the only case which has to process surrogates, thus
2012 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002013 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014#else
2015 assert(num_surrogates == 0);
2016 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2017#endif
2018 break;
2019 default:
2020 assert(0 && "Impossible state");
2021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024}
2025
Alexander Belopolsky40018472011-02-26 01:02:56 +00002026PyObject *
2027PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002028{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002029 if (size < 0) {
2030 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002032 return NULL;
2033 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002034 if (u != NULL)
2035 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2036 else
2037 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002038}
2039
Alexander Belopolsky40018472011-02-26 01:02:56 +00002040PyObject *
2041PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002042{
2043 size_t size = strlen(u);
2044 if (size > PY_SSIZE_T_MAX) {
2045 PyErr_SetString(PyExc_OverflowError, "input too long");
2046 return NULL;
2047 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002048 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002049}
2050
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002051PyObject *
2052_PyUnicode_FromId(_Py_Identifier *id)
2053{
2054 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002055 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2056 strlen(id->string),
2057 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002058 if (!id->object)
2059 return NULL;
2060 PyUnicode_InternInPlace(&id->object);
2061 assert(!id->next);
2062 id->next = static_strings;
2063 static_strings = id;
2064 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002065 return id->object;
2066}
2067
2068void
2069_PyUnicode_ClearStaticStrings()
2070{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002071 _Py_Identifier *tmp, *s = static_strings;
2072 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002073 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002074 tmp = s->next;
2075 s->next = NULL;
2076 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002077 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002078 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079}
2080
Benjamin Peterson0df54292012-03-26 14:50:32 -04002081/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002082
Victor Stinnerd3f08822012-05-29 12:57:52 +02002083PyObject*
2084_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002085{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002086 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002087 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002088 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002089#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002090 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002091#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002092 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002093 }
Victor Stinner785938e2011-12-11 20:09:03 +01002094 unicode = PyUnicode_New(size, 127);
2095 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002096 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002097 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2098 assert(_PyUnicode_CheckConsistency(unicode, 1));
2099 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002100}
2101
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002102static Py_UCS4
2103kind_maxchar_limit(unsigned int kind)
2104{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002105 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002106 case PyUnicode_1BYTE_KIND:
2107 return 0x80;
2108 case PyUnicode_2BYTE_KIND:
2109 return 0x100;
2110 case PyUnicode_4BYTE_KIND:
2111 return 0x10000;
2112 default:
2113 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002114 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002115 }
2116}
2117
Victor Stinnere6abb482012-05-02 01:15:40 +02002118Py_LOCAL_INLINE(Py_UCS4)
2119align_maxchar(Py_UCS4 maxchar)
2120{
2121 if (maxchar <= 127)
2122 return 127;
2123 else if (maxchar <= 255)
2124 return 255;
2125 else if (maxchar <= 65535)
2126 return 65535;
2127 else
2128 return MAX_UNICODE;
2129}
2130
Victor Stinner702c7342011-10-05 13:50:52 +02002131static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002132_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002135 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002136
Serhiy Storchaka678db842013-01-26 12:16:36 +02002137 if (size == 0)
2138 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002139 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002140 if (size == 1)
2141 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002143 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002144 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 if (!res)
2146 return NULL;
2147 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002148 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002150}
2151
Victor Stinnere57b1c02011-09-28 22:20:48 +02002152static PyObject*
2153_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154{
2155 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002156 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002157
Serhiy Storchaka678db842013-01-26 12:16:36 +02002158 if (size == 0)
2159 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002160 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 if (size == 1)
2162 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002164 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002165 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (!res)
2167 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002170 else {
2171 _PyUnicode_CONVERT_BYTES(
2172 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2173 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002174 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 return res;
2176}
2177
Victor Stinnere57b1c02011-09-28 22:20:48 +02002178static PyObject*
2179_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180{
2181 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002182 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002183
Serhiy Storchaka678db842013-01-26 12:16:36 +02002184 if (size == 0)
2185 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002187 if (size == 1)
2188 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002189
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002190 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 if (!res)
2193 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002194 if (max_char < 256)
2195 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2196 PyUnicode_1BYTE_DATA(res));
2197 else if (max_char < 0x10000)
2198 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2199 PyUnicode_2BYTE_DATA(res));
2200 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
2206PyObject*
2207PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2208{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002209 if (size < 0) {
2210 PyErr_SetString(PyExc_ValueError, "size must be positive");
2211 return NULL;
2212 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002213 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002215 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002217 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002219 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002220 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
2222 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224}
2225
Victor Stinnerece58de2012-04-23 23:36:38 +02002226Py_UCS4
2227_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2228{
2229 enum PyUnicode_Kind kind;
2230 void *startptr, *endptr;
2231
2232 assert(PyUnicode_IS_READY(unicode));
2233 assert(0 <= start);
2234 assert(end <= PyUnicode_GET_LENGTH(unicode));
2235 assert(start <= end);
2236
2237 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2238 return PyUnicode_MAX_CHAR_VALUE(unicode);
2239
2240 if (start == end)
2241 return 127;
2242
Victor Stinner94d558b2012-04-27 22:26:58 +02002243 if (PyUnicode_IS_ASCII(unicode))
2244 return 127;
2245
Victor Stinnerece58de2012-04-23 23:36:38 +02002246 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002247 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002248 endptr = (char *)startptr + end * kind;
2249 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002250 switch(kind) {
2251 case PyUnicode_1BYTE_KIND:
2252 return ucs1lib_find_max_char(startptr, endptr);
2253 case PyUnicode_2BYTE_KIND:
2254 return ucs2lib_find_max_char(startptr, endptr);
2255 case PyUnicode_4BYTE_KIND:
2256 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002257 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002258 assert(0);
2259 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002260 }
2261}
2262
Victor Stinner25a4b292011-10-06 12:31:55 +02002263/* Ensure that a string uses the most efficient storage, if it is not the
2264 case: create a new string with of the right kind. Write NULL into *p_unicode
2265 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002266static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002267unicode_adjust_maxchar(PyObject **p_unicode)
2268{
2269 PyObject *unicode, *copy;
2270 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002271 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002272 unsigned int kind;
2273
2274 assert(p_unicode != NULL);
2275 unicode = *p_unicode;
2276 assert(PyUnicode_IS_READY(unicode));
2277 if (PyUnicode_IS_ASCII(unicode))
2278 return;
2279
2280 len = PyUnicode_GET_LENGTH(unicode);
2281 kind = PyUnicode_KIND(unicode);
2282 if (kind == PyUnicode_1BYTE_KIND) {
2283 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002284 max_char = ucs1lib_find_max_char(u, u + len);
2285 if (max_char >= 128)
2286 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002287 }
2288 else if (kind == PyUnicode_2BYTE_KIND) {
2289 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002290 max_char = ucs2lib_find_max_char(u, u + len);
2291 if (max_char >= 256)
2292 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002293 }
2294 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002297 max_char = ucs4lib_find_max_char(u, u + len);
2298 if (max_char >= 0x10000)
2299 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002301 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002302 if (copy != NULL)
2303 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002304 Py_DECREF(unicode);
2305 *p_unicode = copy;
2306}
2307
Victor Stinner034f6cf2011-09-30 02:26:44 +02002308PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002309_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002310{
Victor Stinner87af4f22011-11-21 23:03:47 +01002311 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002312 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002313
Victor Stinner034f6cf2011-09-30 02:26:44 +02002314 if (!PyUnicode_Check(unicode)) {
2315 PyErr_BadInternalCall();
2316 return NULL;
2317 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002318 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002319 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002320
Victor Stinner87af4f22011-11-21 23:03:47 +01002321 length = PyUnicode_GET_LENGTH(unicode);
2322 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002323 if (!copy)
2324 return NULL;
2325 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2326
Victor Stinner87af4f22011-11-21 23:03:47 +01002327 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2328 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002329 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002330 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331}
2332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333
Victor Stinnerbc603d12011-10-02 01:00:40 +02002334/* Widen Unicode objects to larger buffers. Don't write terminating null
2335 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336
2337void*
2338_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2339{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002340 Py_ssize_t len;
2341 void *result;
2342 unsigned int skind;
2343
Benjamin Petersonbac79492012-01-14 13:34:47 -05002344 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002345 return NULL;
2346
2347 len = PyUnicode_GET_LENGTH(s);
2348 skind = PyUnicode_KIND(s);
2349 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002350 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return NULL;
2352 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002353 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002354 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002355 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002356 if (!result)
2357 return PyErr_NoMemory();
2358 assert(skind == PyUnicode_1BYTE_KIND);
2359 _PyUnicode_CONVERT_BYTES(
2360 Py_UCS1, Py_UCS2,
2361 PyUnicode_1BYTE_DATA(s),
2362 PyUnicode_1BYTE_DATA(s) + len,
2363 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002365 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002366 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002367 if (!result)
2368 return PyErr_NoMemory();
2369 if (skind == PyUnicode_2BYTE_KIND) {
2370 _PyUnicode_CONVERT_BYTES(
2371 Py_UCS2, Py_UCS4,
2372 PyUnicode_2BYTE_DATA(s),
2373 PyUnicode_2BYTE_DATA(s) + len,
2374 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002376 else {
2377 assert(skind == PyUnicode_1BYTE_KIND);
2378 _PyUnicode_CONVERT_BYTES(
2379 Py_UCS1, Py_UCS4,
2380 PyUnicode_1BYTE_DATA(s),
2381 PyUnicode_1BYTE_DATA(s) + len,
2382 result);
2383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002385 default:
2386 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 }
Victor Stinner01698042011-10-04 00:04:26 +02002388 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 return NULL;
2390}
2391
2392static Py_UCS4*
2393as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2394 int copy_null)
2395{
2396 int kind;
2397 void *data;
2398 Py_ssize_t len, targetlen;
2399 if (PyUnicode_READY(string) == -1)
2400 return NULL;
2401 kind = PyUnicode_KIND(string);
2402 data = PyUnicode_DATA(string);
2403 len = PyUnicode_GET_LENGTH(string);
2404 targetlen = len;
2405 if (copy_null)
2406 targetlen++;
2407 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002408 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 if (!target) {
2410 PyErr_NoMemory();
2411 return NULL;
2412 }
2413 }
2414 else {
2415 if (targetsize < targetlen) {
2416 PyErr_Format(PyExc_SystemError,
2417 "string is longer than the buffer");
2418 if (copy_null && 0 < targetsize)
2419 target[0] = 0;
2420 return NULL;
2421 }
2422 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002423 if (kind == PyUnicode_1BYTE_KIND) {
2424 Py_UCS1 *start = (Py_UCS1 *) data;
2425 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002427 else if (kind == PyUnicode_2BYTE_KIND) {
2428 Py_UCS2 *start = (Py_UCS2 *) data;
2429 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2430 }
2431 else {
2432 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 if (copy_null)
2436 target[len] = 0;
2437 return target;
2438}
2439
2440Py_UCS4*
2441PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2442 int copy_null)
2443{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002444 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 PyErr_BadInternalCall();
2446 return NULL;
2447 }
2448 return as_ucs4(string, target, targetsize, copy_null);
2449}
2450
2451Py_UCS4*
2452PyUnicode_AsUCS4Copy(PyObject *string)
2453{
2454 return as_ucs4(string, NULL, 0, 1);
2455}
2456
2457#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002458
Alexander Belopolsky40018472011-02-26 01:02:56 +00002459PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002460PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002464 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 PyErr_BadInternalCall();
2466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 }
2468
Martin v. Löwis790465f2008-04-05 20:41:37 +00002469 if (size == -1) {
2470 size = wcslen(w);
2471 }
2472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474}
2475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002477
/* Maximum number of characters required for the output of %lld or %p;
   used below as the size of the scratch buffers that sprintf() fills.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 for the sign.  53/22 is an upper bound for log10(256),
   and for a positive integer x, (x - 1) / 22 + 1 is ceil(x / 22) in
   integer arithmetic; the remaining + 1 in the leading 2 is the sign.

   NOTE(review): the formula has no explicit slot for sprintf()'s
   terminating NUL; presumably the log10(256) over-estimate leaves room
   for it on supported platforms -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002482
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002483static int
2484unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2485 Py_ssize_t width, Py_ssize_t precision)
2486{
2487 Py_ssize_t length, fill, arglen;
2488 Py_UCS4 maxchar;
2489
2490 if (PyUnicode_READY(str) == -1)
2491 return -1;
2492
2493 length = PyUnicode_GET_LENGTH(str);
2494 if ((precision == -1 || precision >= length)
2495 && width <= length)
2496 return _PyUnicodeWriter_WriteStr(writer, str);
2497
2498 if (precision != -1)
2499 length = Py_MIN(precision, length);
2500
2501 arglen = Py_MAX(length, width);
2502 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2503 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2504 else
2505 maxchar = writer->maxchar;
2506
2507 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2508 return -1;
2509
2510 if (width > length) {
2511 fill = width - length;
2512 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2513 return -1;
2514 writer->pos += fill;
2515 }
2516
2517 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2518 str, 0, length);
2519 writer->pos += length;
2520 return 0;
2521}
2522
2523static int
2524unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2525 Py_ssize_t width, Py_ssize_t precision)
2526{
2527 /* UTF-8 */
2528 Py_ssize_t length;
2529 PyObject *unicode;
2530 int res;
2531
2532 length = strlen(str);
2533 if (precision != -1)
2534 length = Py_MIN(length, precision);
2535 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2536 if (unicode == NULL)
2537 return -1;
2538
2539 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2540 Py_DECREF(unicode);
2541 return res;
2542}
2543
/* Format ONE conversion specification of a PyUnicode_FromFormatV()-style
   format string and append the result to `writer`.

   f      points at the '%' that introduces the specification.
   vargs  supplies the argument(s) the specification consumes.

   The accepted shape is  %[0][width][.precision][l|ll|z]<code>  where
   <code> is one of  c i d u x p s U V S R A %.  The l, ll and z
   prefixes are recognised only directly before d, u or i (ll only when
   HAVE_LONG_LONG is defined).

   Returns a pointer to the first format character after the
   specification, or NULL on error.  For an unrecognised conversion the
   whole remainder of the format string, starting at the '%', is copied
   to the output as Latin-1 and a pointer to the terminating NUL of the
   format string is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;          /* start of the specification (the '%') */
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;       /* -1 when no width was given */
    Py_ssize_t precision;   /* -1 when no precision was given */
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if appending it would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle the long and long long prefixes:
       %ld, %li, %lu, %lld, %lli and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag: %zd, %zi and %zu. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* Nothing follows this conversion in the format string: stop
       over-allocating the writer's buffer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    /* f now points at the conversion code itself. */
    switch (*f) {
    case 'c':
    {
        /* A single code point passed as an int. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Let the C library produce the digits; width, precision and
           zero padding are applied by hand further down. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* The l/ll/z prefixes are only recognised before d, u or i,
               so the argument is always fetched as a plain int here. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on `precision` is the length of the number including
           its leading zeros; it is never shorter than the sprintf
           output (this also covers the "not given" value -1). */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Left-pad up to the field width with '0' or ' '. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Then the leading zeros requested by the precision. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the digits (and the NUL) right by
               two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A string object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Consumes TWO arguments: a string object, and a C string that
           is used instead when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* The result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* The result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* The result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal percent sign; consumes no argument. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        /* Point at the terminating NUL of the format string. */
        f = p+len;
        return f;
    }

    /* Step over the conversion code. */
    f++;
    return f;
}
2847
Walter Dörwaldd2034312007-05-18 16:29:38 +00002848PyObject *
2849PyUnicode_FromFormatV(const char *format, va_list vargs)
2850{
Victor Stinnere215d962012-10-06 23:03:36 +02002851 va_list vargs2;
2852 const char *f;
2853 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854
Victor Stinner8f674cc2013-04-17 23:02:17 +02002855 _PyUnicodeWriter_Init(&writer);
2856 writer.min_length = strlen(format) + 100;
2857 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002858
2859 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2860 Copy it to be able to pass a reference to a subfunction. */
2861 Py_VA_COPY(vargs2, vargs);
2862
2863 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002865 f = unicode_fromformat_arg(&writer, f, &vargs2);
2866 if (f == NULL)
2867 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002870 const char *p;
2871 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinnere215d962012-10-06 23:03:36 +02002873 p = f;
2874 do
2875 {
2876 if ((unsigned char)*p > 127) {
2877 PyErr_Format(PyExc_ValueError,
2878 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2879 "string, got a non-ASCII byte: 0x%02x",
2880 (unsigned char)*p);
2881 return NULL;
2882 }
2883 p++;
2884 }
2885 while (*p != '\0' && *p != '%');
2886 len = p - f;
2887
2888 if (*p == '\0')
2889 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002890
2891 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002892 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002893
2894 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002895 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 return _PyUnicodeWriter_Finish(&writer);
2898
2899 fail:
2900 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002901 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002902}
2903
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904PyObject *
2905PyUnicode_FromFormat(const char *format, ...)
2906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 PyObject* ret;
2908 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002909
2910#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002912#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002914#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002915 ret = PyUnicode_FromFormatV(format, vargs);
2916 va_end(vargs);
2917 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918}
2919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920#ifdef HAVE_WCHAR_H
2921
Victor Stinner5593d8a2010-10-02 11:11:27 +00002922/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2923 convert a Unicode object to a wide character string.
2924
Victor Stinnerd88d9832011-09-06 02:00:05 +02002925 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002926 character) required to convert the unicode object. Ignore size argument.
2927
Victor Stinnerd88d9832011-09-06 02:00:05 +02002928 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002929 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002930 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002931static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002932unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002933 wchar_t *w,
2934 Py_ssize_t size)
2935{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002936 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 const wchar_t *wstr;
2938
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002939 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 if (wstr == NULL)
2941 return -1;
2942
Victor Stinner5593d8a2010-10-02 11:11:27 +00002943 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 if (size > res)
2945 size = res + 1;
2946 else
2947 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 return res;
2950 }
2951 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002953}
2954
2955Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002956PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002957 wchar_t *w,
2958 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959{
2960 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002961 PyErr_BadInternalCall();
2962 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002964 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965}
2966
Victor Stinner137c34c2010-09-29 10:25:54 +00002967wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002968PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002969 Py_ssize_t *size)
2970{
2971 wchar_t* buffer;
2972 Py_ssize_t buflen;
2973
2974 if (unicode == NULL) {
2975 PyErr_BadInternalCall();
2976 return NULL;
2977 }
2978
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002979 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002980 if (buflen == -1)
2981 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002982 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002983 if (buffer == NULL) {
2984 PyErr_NoMemory();
2985 return NULL;
2986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002988 if (buflen == -1) {
2989 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002990 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002991 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002992 if (size != NULL)
2993 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002994 return buffer;
2995}
2996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Alexander Belopolsky40018472011-02-26 01:02:56 +00002999PyObject *
3000PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003001{
Victor Stinner8faf8212011-12-08 22:14:11 +01003002 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 PyErr_SetString(PyExc_ValueError,
3004 "chr() arg not in range(0x110000)");
3005 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003006 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003007
Victor Stinner985a82a2014-01-03 12:53:47 +01003008 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003009}
3010
Alexander Belopolsky40018472011-02-26 01:02:56 +00003011PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003012PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003014 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003016 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003017 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003018 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 Py_INCREF(obj);
3020 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003021 }
3022 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 /* For a Unicode subtype that's not a Unicode object,
3024 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003025 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003026 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003027 PyErr_Format(PyExc_TypeError,
3028 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003029 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003030 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003031}
3032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003034PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003035 const char *encoding,
3036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003037{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003038 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003039 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 PyErr_BadInternalCall();
3043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003045
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003046 /* Decoding bytes objects is the most common case and should be fast */
3047 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003048 if (PyBytes_GET_SIZE(obj) == 0)
3049 _Py_RETURN_UNICODE_EMPTY();
3050 v = PyUnicode_Decode(
3051 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3052 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003053 return v;
3054 }
3055
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 PyErr_SetString(PyExc_TypeError,
3058 "decoding str is not supported");
3059 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003061
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003062 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3063 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3064 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02003065 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003066 Py_TYPE(obj)->tp_name);
3067 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003068 }
Tim Petersced69f82003-09-16 20:30:58 +00003069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 PyBuffer_Release(&buffer);
3072 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003074
Serhiy Storchaka05997252013-01-26 12:14:02 +02003075 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078}
3079
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null byte, is written to lower,
   which has room for lower_len bytes.

   Return 0 on error (the normalized name does not fit in lower_len bytes,
   which includes lower_len == 0), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing lower_len - 1, which would wrap around for size_t and let
       the loop below write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 Py_ssize_t size,
3123 const char *encoding,
3124 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003125{
3126 PyObject *buffer = NULL, *unicode;
3127 Py_buffer info;
3128 char lower[11]; /* Enough for any encoding shortcut */
3129
Fred Drakee4315f52000-05-09 19:53:39 +00003130 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003131 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003132 if ((strcmp(lower, "utf-8") == 0) ||
3133 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003134 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003135 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003136 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003137 (strcmp(lower, "iso-8859-1") == 0) ||
3138 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003139 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003140#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003141 else if (strcmp(lower, "mbcs") == 0)
3142 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003143#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003144 else if (strcmp(lower, "ascii") == 0)
3145 return PyUnicode_DecodeASCII(s, size, errors);
3146 else if (strcmp(lower, "utf-16") == 0)
3147 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3148 else if (strcmp(lower, "utf-32") == 0)
3149 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151
3152 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003153 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003154 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003155 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003156 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (buffer == NULL)
3158 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003159 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 if (unicode == NULL)
3161 goto onError;
3162 if (!PyUnicode_Check(unicode)) {
3163 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003164 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3165 "use codecs.decode() to decode to arbitrary types",
3166 encoding,
3167 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 Py_DECREF(unicode);
3169 goto onError;
3170 }
3171 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003172 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003173
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 Py_XDECREF(buffer);
3176 return NULL;
3177}
3178
Alexander Belopolsky40018472011-02-26 01:02:56 +00003179PyObject *
3180PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003181 const char *encoding,
3182 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003183{
3184 PyObject *v;
3185
3186 if (!PyUnicode_Check(unicode)) {
3187 PyErr_BadArgument();
3188 goto onError;
3189 }
3190
3191 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003193
3194 /* Decode via the codec registry */
3195 v = PyCodec_Decode(unicode, encoding, errors);
3196 if (v == NULL)
3197 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003198 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003201 return NULL;
3202}
3203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204PyObject *
3205PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003206 const char *encoding,
3207 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208{
3209 PyObject *v;
3210
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_BadArgument();
3213 goto onError;
3214 }
3215
3216 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218
3219 /* Decode via the codec registry */
3220 v = PyCodec_Decode(unicode, encoding, errors);
3221 if (v == NULL)
3222 goto onError;
3223 if (!PyUnicode_Check(v)) {
3224 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003225 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3226 "use codecs.decode() to decode to arbitrary types",
3227 encoding,
3228 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003229 Py_DECREF(v);
3230 goto onError;
3231 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003232 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003233
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235 return NULL;
3236}
3237
Alexander Belopolsky40018472011-02-26 01:02:56 +00003238PyObject *
3239PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003240 Py_ssize_t size,
3241 const char *encoding,
3242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243{
3244 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003245
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 unicode = PyUnicode_FromUnicode(s, size);
3247 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3250 Py_DECREF(unicode);
3251 return v;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 const char *encoding,
3257 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258{
3259 PyObject *v;
3260
3261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 goto onError;
3264 }
3265
3266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003268
3269 /* Encode via the codec registry */
3270 v = PyCodec_Encode(unicode, encoding, errors);
3271 if (v == NULL)
3272 goto onError;
3273 return v;
3274
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003276 return NULL;
3277}
3278
/* Find the first position in wstr that wcstombs() refuses to convert.

   The string is probed one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Returns the index of the first unit
   for which wcstombs() reports an error, or 0 when none is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Null-terminated scratch string holding the unit under test. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        /* Remember where this unit starts before advancing. */
        const wchar_t *mark = cursor;

        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[1] = *cursor++;
        }
        else {
            unit[1] = 0;
        }
#endif
        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3325
Victor Stinner1b579672011-12-17 05:47:23 +01003326static int
3327locale_error_handler(const char *errors, int *surrogateescape)
3328{
Victor Stinner50149202015-09-22 00:26:54 +02003329 _Py_error_handler error_handler = get_error_handler(errors);
3330 switch (error_handler)
3331 {
3332 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003333 *surrogateescape = 0;
3334 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003335 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003336 *surrogateescape = 1;
3337 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003338 default:
3339 PyErr_Format(PyExc_ValueError,
3340 "only 'strict' and 'surrogateescape' error handlers "
3341 "are supported, not '%s'",
3342 errors);
3343 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003344 }
Victor Stinner1b579672011-12-17 05:47:23 +01003345}
3346
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003347PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003348PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003349{
3350 Py_ssize_t wlen, wlen2;
3351 wchar_t *wstr;
3352 PyObject *bytes = NULL;
3353 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003354 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003355 PyObject *exc;
3356 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003357 int surrogateescape;
3358
3359 if (locale_error_handler(errors, &surrogateescape) < 0)
3360 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003361
3362 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3363 if (wstr == NULL)
3364 return NULL;
3365
3366 wlen2 = wcslen(wstr);
3367 if (wlen2 != wlen) {
3368 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003369 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370 return NULL;
3371 }
3372
3373 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003374 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003375 char *str;
3376
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003377 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003378 if (str == NULL) {
3379 if (error_pos == (size_t)-1) {
3380 PyErr_NoMemory();
3381 PyMem_Free(wstr);
3382 return NULL;
3383 }
3384 else {
3385 goto encode_error;
3386 }
3387 }
3388 PyMem_Free(wstr);
3389
3390 bytes = PyBytes_FromString(str);
3391 PyMem_Free(str);
3392 }
3393 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003394 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003395 size_t len, len2;
3396
3397 len = wcstombs(NULL, wstr, 0);
3398 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003399 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003400 goto encode_error;
3401 }
3402
3403 bytes = PyBytes_FromStringAndSize(NULL, len);
3404 if (bytes == NULL) {
3405 PyMem_Free(wstr);
3406 return NULL;
3407 }
3408
3409 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3410 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003411 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412 goto encode_error;
3413 }
3414 PyMem_Free(wstr);
3415 }
3416 return bytes;
3417
3418encode_error:
3419 errmsg = strerror(errno);
3420 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003421
3422 if (error_pos == (size_t)-1)
3423 error_pos = wcstombs_errorpos(wstr);
3424
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 PyMem_Free(wstr);
3426 Py_XDECREF(bytes);
3427
Victor Stinner2f197072011-12-17 07:08:30 +01003428 if (errmsg != NULL) {
3429 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003430 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003431 if (wstr != NULL) {
3432 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003433 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003434 } else
3435 errmsg = NULL;
3436 }
3437 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003438 reason = PyUnicode_FromString(
3439 "wcstombs() encountered an unencodable "
3440 "wide character");
3441 if (reason == NULL)
3442 return NULL;
3443
3444 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3445 "locale", unicode,
3446 (Py_ssize_t)error_pos,
3447 (Py_ssize_t)(error_pos+1),
3448 reason);
3449 Py_DECREF(reason);
3450 if (exc != NULL) {
3451 PyCodec_StrictErrors(exc);
3452 Py_XDECREF(exc);
3453 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003454 return NULL;
3455}
3456
Victor Stinnerad158722010-10-27 00:25:46 +00003457PyObject *
3458PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003459{
Victor Stinner99b95382011-07-04 14:23:54 +02003460#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003461 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003462#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003464#else
Victor Stinner793b5312011-04-27 00:24:21 +02003465 PyInterpreterState *interp = PyThreadState_GET()->interp;
3466 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3467 cannot use it to encode and decode filenames before it is loaded. Load
3468 the Python codec requires to encode at least its own filename. Use the C
3469 version of the locale codec until the codec registry is initialized and
3470 the Python codec is loaded.
3471
3472 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3473 cannot only rely on it: check also interp->fscodec_initialized for
3474 subinterpreters. */
3475 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003476 return PyUnicode_AsEncodedString(unicode,
3477 Py_FileSystemDefaultEncoding,
3478 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003479 }
3480 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003481 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003482 }
Victor Stinnerad158722010-10-27 00:25:46 +00003483#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003484}
3485
Alexander Belopolsky40018472011-02-26 01:02:56 +00003486PyObject *
3487PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003488 const char *encoding,
3489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
3491 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003492 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 if (!PyUnicode_Check(unicode)) {
3495 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 }
Fred Drakee4315f52000-05-09 19:53:39 +00003498
Fred Drakee4315f52000-05-09 19:53:39 +00003499 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003501 if ((strcmp(lower, "utf-8") == 0) ||
3502 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003503 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003504 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003506 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003507 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003508 }
Victor Stinner37296e82010-06-10 13:36:23 +00003509 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003510 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003511 (strcmp(lower, "iso-8859-1") == 0) ||
3512 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003514#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003515 else if (strcmp(lower, "mbcs") == 0)
3516 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003517#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003518 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521
3522 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003523 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003525 return NULL;
3526
3527 /* The normal path */
3528 if (PyBytes_Check(v))
3529 return v;
3530
3531 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003532 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003533 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003534 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003535
3536 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003537 "encoder %s returned bytearray instead of bytes; "
3538 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003539 encoding);
3540 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003541 Py_DECREF(v);
3542 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003543 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003545 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3546 Py_DECREF(v);
3547 return b;
3548 }
3549
3550 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003551 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3552 "use codecs.encode() to encode to arbitrary types",
3553 encoding,
3554 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003555 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003556 return NULL;
3557}
3558
Alexander Belopolsky40018472011-02-26 01:02:56 +00003559PyObject *
3560PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003561 const char *encoding,
3562 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003563{
3564 PyObject *v;
3565
3566 if (!PyUnicode_Check(unicode)) {
3567 PyErr_BadArgument();
3568 goto onError;
3569 }
3570
3571 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003573
3574 /* Encode via the codec registry */
3575 v = PyCodec_Encode(unicode, encoding, errors);
3576 if (v == NULL)
3577 goto onError;
3578 if (!PyUnicode_Check(v)) {
3579 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003580 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3581 "use codecs.encode() to encode to arbitrary types",
3582 encoding,
3583 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003584 Py_DECREF(v);
3585 goto onError;
3586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003588
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 return NULL;
3591}
3592
/* Locate the first multibyte sequence of str that mbrtowc() rejects.

   str: multibyte string to scan.
   len: number of bytes of str to examine.

   Returns the byte offset of the first invalid or incomplete multibyte
   character.  Returns 0 when no such sequence is found within len bytes,
   so a result of 0 does not by itself distinguish "error at offset 0" from
   "nothing found".  When HAVE_MBRTOWC is not defined no scan is performed
   and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3623
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003625PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003626 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003627{
3628 wchar_t smallbuf[256];
3629 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3630 wchar_t *wstr;
3631 size_t wlen, wlen2;
3632 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003633 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003634 size_t error_pos;
3635 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003636 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3637 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003638
3639 if (locale_error_handler(errors, &surrogateescape) < 0)
3640 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003642 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3643 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003644 return NULL;
3645 }
3646
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003647 if (surrogateescape) {
3648 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003649 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003650 if (wstr == NULL) {
3651 if (wlen == (size_t)-1)
3652 PyErr_NoMemory();
3653 else
3654 PyErr_SetFromErrno(PyExc_OSError);
3655 return NULL;
3656 }
3657
3658 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003659 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003660 }
3661 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003662 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003663#ifndef HAVE_BROKEN_MBSTOWCS
3664 wlen = mbstowcs(NULL, str, 0);
3665#else
3666 wlen = len;
3667#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003668 if (wlen == (size_t)-1)
3669 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003670 if (wlen+1 <= smallbuf_len) {
3671 wstr = smallbuf;
3672 }
3673 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003674 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003675 if (!wstr)
3676 return PyErr_NoMemory();
3677 }
3678
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679 wlen2 = mbstowcs(wstr, str, wlen+1);
3680 if (wlen2 == (size_t)-1) {
3681 if (wstr != smallbuf)
3682 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003683 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003684 }
3685#ifdef HAVE_BROKEN_MBSTOWCS
3686 assert(wlen2 == wlen);
3687#endif
3688 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3689 if (wstr != smallbuf)
3690 PyMem_Free(wstr);
3691 }
3692 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003693
3694decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003695 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003696 errmsg = strerror(errno);
3697 assert(errmsg != NULL);
3698
3699 error_pos = mbstowcs_errorpos(str, len);
3700 if (errmsg != NULL) {
3701 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003702 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003703 if (wstr != NULL) {
3704 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003705 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003706 }
Victor Stinner2f197072011-12-17 07:08:30 +01003707 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003708 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003709 reason = PyUnicode_FromString(
3710 "mbstowcs() encountered an invalid multibyte sequence");
3711 if (reason == NULL)
3712 return NULL;
3713
3714 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3715 "locale", str, len,
3716 (Py_ssize_t)error_pos,
3717 (Py_ssize_t)(error_pos+1),
3718 reason);
3719 Py_DECREF(reason);
3720 if (exc != NULL) {
3721 PyCodec_StrictErrors(exc);
3722 Py_XDECREF(exc);
3723 }
3724 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003725}
3726
3727PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003728PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003729{
3730 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003731 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732}
3733
3734
3735PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003736PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003737 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003738 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3739}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003740
Christian Heimes5894ba72007-11-04 11:43:14 +00003741PyObject*
3742PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3743{
Victor Stinner99b95382011-07-04 14:23:54 +02003744#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003745 return PyUnicode_DecodeMBCS(s, size, NULL);
3746#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003747 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003748#else
Victor Stinner793b5312011-04-27 00:24:21 +02003749 PyInterpreterState *interp = PyThreadState_GET()->interp;
3750 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3751 cannot use it to encode and decode filenames before it is loaded. Load
3752 the Python codec requires to encode at least its own filename. Use the C
3753 version of the locale codec until the codec registry is initialized and
3754 the Python codec is loaded.
3755
3756 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3757 cannot only rely on it: check also interp->fscodec_initialized for
3758 subinterpreters. */
3759 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003760 return PyUnicode_Decode(s, size,
3761 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003762 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003763 }
3764 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003765 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003766 }
Victor Stinnerad158722010-10-27 00:25:46 +00003767#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003768}
3769
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770
3771int
3772PyUnicode_FSConverter(PyObject* arg, void* addr)
3773{
3774 PyObject *output = NULL;
3775 Py_ssize_t size;
3776 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003777 if (arg == NULL) {
3778 Py_DECREF(*(PyObject**)addr);
3779 return 1;
3780 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003781 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 output = arg;
3783 Py_INCREF(output);
3784 }
3785 else {
3786 arg = PyUnicode_FromObject(arg);
3787 if (!arg)
3788 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003789 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003790 Py_DECREF(arg);
3791 if (!output)
3792 return 0;
3793 if (!PyBytes_Check(output)) {
3794 Py_DECREF(output);
3795 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3796 return 0;
3797 }
3798 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003799 size = PyBytes_GET_SIZE(output);
3800 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003801 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003802 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003803 Py_DECREF(output);
3804 return 0;
3805 }
3806 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003807 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003808}
3809
3810
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003811int
3812PyUnicode_FSDecoder(PyObject* arg, void* addr)
3813{
3814 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003815 if (arg == NULL) {
3816 Py_DECREF(*(PyObject**)addr);
3817 return 1;
3818 }
3819 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003820 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003822 output = arg;
3823 Py_INCREF(output);
3824 }
3825 else {
3826 arg = PyBytes_FromObject(arg);
3827 if (!arg)
3828 return 0;
3829 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3830 PyBytes_GET_SIZE(arg));
3831 Py_DECREF(arg);
3832 if (!output)
3833 return 0;
3834 if (!PyUnicode_Check(output)) {
3835 Py_DECREF(output);
3836 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3837 return 0;
3838 }
3839 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003840 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003841 Py_DECREF(output);
3842 return 0;
3843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003845 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003846 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003847 Py_DECREF(output);
3848 return 0;
3849 }
3850 *(PyObject**)addr = output;
3851 return Py_CLEANUP_SUPPORTED;
3852}
3853
3854
Martin v. Löwis5b222132007-06-10 09:51:05 +00003855char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003857{
Christian Heimesf3863112007-11-22 07:46:41 +00003858 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003860 if (!PyUnicode_Check(unicode)) {
3861 PyErr_BadArgument();
3862 return NULL;
3863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003865 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003867 if (PyUnicode_UTF8(unicode) == NULL) {
3868 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3870 if (bytes == NULL)
3871 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3873 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003874 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 Py_DECREF(bytes);
3876 return NULL;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3879 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3880 PyBytes_AS_STRING(bytes),
3881 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 Py_DECREF(bytes);
3883 }
3884
3885 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003886 *psize = PyUnicode_UTF8_LENGTH(unicode);
3887 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003888}
3889
3890char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3894}
3895
/* Return the wchar_t representation stored on a str object, building and
   storing it first when _PyUnicode_WSTR(unicode) is still NULL.

   unicode: str object; PyErr_BadArgument() is called for anything else.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   Returns _PyUnicode_WSTR(unicode), or NULL with an exception set when the
   argument is not a str or the buffer cannot be allocated.  Which
   conversions are compiled in depends on SIZEOF_WCHAR_T; the combinations
   where the representation is expected to exist already end in
   Py_FatalError(). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: every code point above 0xFFFF is written as two
               wchar_t units below, so count them to size the buffer. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* length + extra units for surrogate pairs + 1 terminator */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: restart from the beginning of the data and
               fill the buffer. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-only sanity check on the size computed above. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Reject lengths for which sizeof(wchar_t) * (length + 1)
               would exceed PY_SSIZE_T_MAX. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            /* length + 1 for the terminator */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each 1-byte character to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte character to a wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
4012
Alexander Belopolsky40018472011-02-26 01:02:56 +00004013Py_UNICODE *
4014PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017}
4018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019
Alexander Belopolsky40018472011-02-26 01:02:56 +00004020Py_ssize_t
4021PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022{
4023 if (!PyUnicode_Check(unicode)) {
4024 PyErr_BadArgument();
4025 goto onError;
4026 }
4027 return PyUnicode_GET_SIZE(unicode);
4028
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 return -1;
4031}
4032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033Py_ssize_t
4034PyUnicode_GetLength(PyObject *unicode)
4035{
Victor Stinner07621332012-06-16 04:53:46 +02004036 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 PyErr_BadArgument();
4038 return -1;
4039 }
Victor Stinner07621332012-06-16 04:53:46 +02004040 if (PyUnicode_READY(unicode) == -1)
4041 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return PyUnicode_GET_LENGTH(unicode);
4043}
4044
4045Py_UCS4
4046PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4047{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004048 void *data;
4049 int kind;
4050
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004051 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4052 PyErr_BadArgument();
4053 return (Py_UCS4)-1;
4054 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004055 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004056 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 return (Py_UCS4)-1;
4058 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004059 data = PyUnicode_DATA(unicode);
4060 kind = PyUnicode_KIND(unicode);
4061 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062}
4063
4064int
4065PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4066{
4067 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004068 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 return -1;
4070 }
Victor Stinner488fa492011-12-12 00:01:39 +01004071 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004072 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004073 PyErr_SetString(PyExc_IndexError, "string index out of range");
4074 return -1;
4075 }
Victor Stinner488fa492011-12-12 00:01:39 +01004076 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004077 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004078 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4079 PyErr_SetString(PyExc_ValueError, "character out of range");
4080 return -1;
4081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4083 index, ch);
4084 return 0;
4085}
4086
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4092
Victor Stinner554f3f02010-06-16 23:33:54 +00004093/* create or adjust a UnicodeDecodeError */
4094static void
4095make_decode_exception(PyObject **exceptionObject,
4096 const char *encoding,
4097 const char *input, Py_ssize_t length,
4098 Py_ssize_t startpos, Py_ssize_t endpos,
4099 const char *reason)
4100{
4101 if (*exceptionObject == NULL) {
4102 *exceptionObject = PyUnicodeDecodeError_Create(
4103 encoding, input, length, startpos, endpos, reason);
4104 }
4105 else {
4106 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4107 goto onError;
4108 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4109 goto onError;
4110 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4111 goto onError;
4112 }
4113 return;
4114
4115onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004116 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004117}
4118
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler:  error handler name and cached handler object;
                          the handler is looked up with
                          PyCodec_LookupError() when *errorHandler is NULL.
   encoding, reason:      used to create or update *exceptionObject.
   input, inend:          bounds of the input buffer; reloaded from the
                          exception object after the callback ran.
   startinpos, endinpos:  error range passed to the exception; *endinpos
                          is set to the position returned by the handler.
   inptr:                 set to *input + the position returned by the
                          handler.
   output, outpos:        wchar_t-kind output string and write position;
                          the string is grown with unicode_resize() when
                          needed and *outpos is advanced past the
                          replacement.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the "O!n;" format prefix doubles as the TypeError
       message for a non-tuple result (see &argparse[4] below). */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Stop here: the bytes accessors below must not be applied to an
           object of another type. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen units.  A string-copy function would stop at
       the first null character of the replacement and zero-fill the
       remainder. */
    wmemcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4234
4235static int
4236unicode_decode_call_errorhandler_writer(
4237 const char *errors, PyObject **errorHandler,
4238 const char *encoding, const char *reason,
4239 const char **input, const char **inend, Py_ssize_t *startinpos,
4240 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4241 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4242{
4243 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4244
4245 PyObject *restuple = NULL;
4246 PyObject *repunicode = NULL;
4247 Py_ssize_t insize;
4248 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004249 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250 PyObject *inputobj = NULL;
4251
4252 if (*errorHandler == NULL) {
4253 *errorHandler = PyCodec_LookupError(errors);
4254 if (*errorHandler == NULL)
4255 goto onError;
4256 }
4257
4258 make_decode_exception(exceptionObject,
4259 encoding,
4260 *input, *inend - *input,
4261 *startinpos, *endinpos,
4262 reason);
4263 if (*exceptionObject == NULL)
4264 goto onError;
4265
4266 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4267 if (restuple == NULL)
4268 goto onError;
4269 if (!PyTuple_Check(restuple)) {
4270 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4271 goto onError;
4272 }
4273 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004274 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004275
4276 /* Copy back the bytes variables, which might have been modified by the
4277 callback */
4278 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4279 if (!inputobj)
4280 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004281 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004283 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004284 *input = PyBytes_AS_STRING(inputobj);
4285 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004286 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004287 /* we can DECREF safely, as the exception has another reference,
4288 so the object won't go away. */
4289 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004293 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004294 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297
Victor Stinner8f674cc2013-04-17 23:02:17 +02004298 if (PyUnicode_READY(repunicode) < 0)
4299 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004300 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004301 if (replen > 1) {
4302 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004303 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004304 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4305 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4306 goto onError;
4307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004309 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004312 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 Py_XDECREF(restuple);
4316 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321}
4322
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323/* --- UTF-7 Codec -------------------------------------------------------- */
4324
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325/* See RFC2152 for details. We encode conservatively and decode liberally. */
4326
4327/* Three simple macros defining base-64. */
4328
/* Non-zero when c belongs to the base-64 alphabet: A-Z, a-z, 0-9,
   '+' or '/'.  The argument is evaluated more than once. */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4336
/* Map a base-64 character to its 6-bit value (0..63).  Only meaningful
   when IS_BASE64(c) holds: any character that is not a letter, a digit
   or '+' yields 63.  The argument is evaluated more than once. */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :     \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :     \
     (c) == '+' ? 62 : 63)
4344
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 63])
4349
/* DECODE_DIRECT: true when a byte met outside a shift sequence stands
 * for itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section.  The argument is evaluated
 * more than once. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
4357
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 *       0 : "Set D"
 *           alphanumeric and '(),-./:?
 *       1 : "Set O"
 *           !"#$%&*;<=>@[]^_`{|}
 *       2 : "whitespace"
 *           ht nl cr sp
 *       3 : special (must be base64 encoded)
 *           everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4391
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character to classify; NUL and anything >= 128 are never
 *             direct
 *   directO   non-zero: "Set O" characters (category 1) are direct
 *   directWS  non-zero: whitespace (category 2) is direct
 *
 * "Set D" characters (category 0) are always direct.  directO and
 * directWS are parenthesized in the expansion so that compound
 * expressions such as `a || b` can be passed safely.  c is evaluated
 * more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403
Alexander Belopolsky40018472011-02-26 01:02:56 +00004404PyObject *
4405PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004406 Py_ssize_t size,
4407 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004409 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4410}
4411
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   the UTF-7 input bytes
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer()
 *   consumed  if non-NULL, receives the number of input bytes consumed;
 *             input ending inside a shift sequence is then not an error:
 *             *consumed is set to the offset of the opening '+' and the
 *             output produced by that sequence is discarded.
 *             If NULL, an inconsistent shift state at end of input is
 *             reported as "unterminated shift sequence".
 *
 * Returns the decoded string (the empty string for size == 0), or NULL
 * on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* Offset of the '+' that opened the current shift sequence, or of an
       unexpected byte; only assigned in those two places below. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer.pos at the moment the current shift sequence began. */
    Py_ssize_t shiftOutStart;
    /* base64buffer accumulates decoded 6-bit groups; base64bits counts
       how many of its low bits are not yet consumed. */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate waiting for its low half, or 0. */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used by the end-of-input error path below when
           the error handler leaves unread input (s < e). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh
                               below like any other code unit. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        /* s++ puts the terminating byte inside the
                           reported error range (endinpos is s-starts). */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A high surrogate still pending at the end of the
                   section is written out only when the terminating byte
                   decodes as itself; otherwise it is dropped by the
                   reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Reached only via goto.  The helper is handed the addresses of
           starts, e and s and may re-point all three. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have moved s back before the end of the
               input; if so, resume decoding from there. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Input ended inside a shift sequence: report everything
               from its '+' onwards as unconsumed. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* The unfinished sequence already wrote characters and
                   the writer holds non-ASCII data: build the result from
                   the first shiftOutStart characters only.
                   NOTE(review): presumably this keeps the discarded
                   characters from influencing the result's maxchar --
                   confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4610
4611
Alexander Belopolsky40018472011-02-26 01:02:56 +00004612PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004613_PyUnicode_EncodeUTF7(PyObject *str,
4614 int base64SetO,
4615 int base64WhiteSpace,
4616 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004618 int kind;
4619 void *data;
4620 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004623 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 unsigned int base64bits = 0;
4625 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 char * out;
4627 char * start;
4628
Benjamin Petersonbac79492012-01-14 13:34:47 -05004629 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004630 return NULL;
4631 kind = PyUnicode_KIND(str);
4632 data = PyUnicode_DATA(str);
4633 len = PyUnicode_GET_LENGTH(str);
4634
4635 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004638 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004639 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004640 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004641 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 if (v == NULL)
4643 return NULL;
4644
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004645 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004646 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004647 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 if (inShift) {
4650 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 /* shifting out */
4652 if (base64bits) { /* output remaining bits */
4653 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4654 base64buffer = 0;
4655 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 }
4657 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 /* Characters not in the BASE64 set implicitly unshift the sequence
4659 so no '-' is required, except if the character is itself a '-' */
4660 if (IS_BASE64(ch) || ch == '-') {
4661 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 *out++ = (char) ch;
4664 }
4665 else {
4666 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004667 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else { /* not in a shift sequence */
4670 if (ch == '+') {
4671 *out++ = '+';
4672 *out++ = '-';
4673 }
4674 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4675 *out++ = (char) ch;
4676 }
4677 else {
4678 *out++ = '+';
4679 inShift = 1;
4680 goto encode_char;
4681 }
4682 }
4683 continue;
4684encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004686 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 /* code first surrogate */
4689 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004690 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 while (base64bits >= 6) {
4692 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4693 base64bits -= 6;
4694 }
4695 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004696 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 base64bits += 16;
4699 base64buffer = (base64buffer << 16) | ch;
4700 while (base64bits >= 6) {
4701 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4702 base64bits -= 6;
4703 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004704 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705 if (base64bits)
4706 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4707 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004709 if (_PyBytes_Resize(&v, out - start) < 0)
4710 return NULL;
4711 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004713PyObject *
4714PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4715 Py_ssize_t size,
4716 int base64SetO,
4717 int base64WhiteSpace,
4718 const char *errors)
4719{
4720 PyObject *result;
4721 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4722 if (tmp == NULL)
4723 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004724 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004725 base64WhiteSpace, errors);
4726 Py_DECREF(tmp);
4727 return result;
4728}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730#undef IS_BASE64
4731#undef FROM_BASE64
4732#undef TO_BASE64
4733#undef DECODE_DIRECT
4734#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736/* --- UTF-8 Codec -------------------------------------------------------- */
4737
Alexander Belopolsky40018472011-02-26 01:02:56 +00004738PyObject *
4739PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004740 Py_ssize_t size,
4741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742{
Walter Dörwald69652032004-09-07 20:24:22 +00004743 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4744}
4745
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004746#include "stringlib/asciilib.h"
4747#include "stringlib/codecs.h"
4748#include "stringlib/undef.h"
4749
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004750#include "stringlib/ucs1lib.h"
4751#include "stringlib/codecs.h"
4752#include "stringlib/undef.h"
4753
4754#include "stringlib/ucs2lib.h"
4755#include "stringlib/codecs.h"
4756#include "stringlib/undef.h"
4757
4758#include "stringlib/ucs4lib.h"
4759#include "stringlib/codecs.h"
4760#include "stringlib/undef.h"
4761
Antoine Pitrouab868312009-01-10 15:40:25 +00004762/* Mask to quickly check whether a C 'long' contains a
4763 non-ASCII, UTF8-encoded char. */
4764#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004765# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004766#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004767# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004768#else
4769# error C 'long' size should be either 4 or 8!
4770#endif
4771
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772static Py_ssize_t
4773ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004776 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004778 /*
4779 * Issue #17237: m68k is a bit different from most architectures in
4780 * that objects do not use "natural alignment" - for example, int and
4781 * long are only aligned at 2-byte boundaries. Therefore the assert()
4782 * won't work; also, tests have shown that skipping the "optimised
4783 * version" will even speed up m68k.
4784 */
4785#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004787 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4788 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 /* Fast path, see in STRINGLIB(utf8_decode) for
4790 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004791 /* Help allocation */
4792 const char *_p = p;
4793 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(const unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 *((unsigned long *)q) = value;
4799 _p += SIZEOF_LONG;
4800 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 p = _p;
4803 while (p < end) {
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004810#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004811#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004812 while (p < end) {
4813 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4814 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004816 /* Help allocation */
4817 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 while (_p < aligned_end) {
4819 unsigned long value = *(unsigned long *) _p;
4820 if (value & ASCII_CHAR_MASK)
4821 break;
4822 _p += SIZEOF_LONG;
4823 }
4824 p = _p;
4825 if (_p == end)
4826 break;
4827 }
4828 if ((unsigned char)*p & 0x80)
4829 break;
4830 ++p;
4831 }
4832 memcpy(dest, start, p - start);
4833 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
Antoine Pitrouab868312009-01-10 15:40:25 +00004835
Victor Stinner785938e2011-12-11 20:09:03 +01004836PyObject *
4837PyUnicode_DecodeUTF8Stateful(const char *s,
4838 Py_ssize_t size,
4839 const char *errors,
4840 Py_ssize_t *consumed)
4841{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004842 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004843 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004844 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004845
4846 Py_ssize_t startinpos;
4847 Py_ssize_t endinpos;
4848 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004849 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004851 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004852
4853 if (size == 0) {
4854 if (consumed)
4855 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004856 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004857 }
4858
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4860 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004861 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862 *consumed = 1;
4863 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004864 }
4865
Victor Stinner8f674cc2013-04-17 23:02:17 +02004866 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004867 writer.min_length = size;
4868 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004869 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004870
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 writer.pos = ascii_decode(s, end, writer.data);
4872 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873 while (s < end) {
4874 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004875 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004876
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004878 if (PyUnicode_IS_ASCII(writer.buffer))
4879 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004881 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004883 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 } else {
4885 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 }
4888
4889 switch (ch) {
4890 case 0:
4891 if (s == end || consumed)
4892 goto End;
4893 errmsg = "unexpected end of data";
4894 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004895 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 break;
4897 case 1:
4898 errmsg = "invalid start byte";
4899 startinpos = s - starts;
4900 endinpos = startinpos + 1;
4901 break;
4902 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004903 case 3:
4904 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 errmsg = "invalid continuation byte";
4906 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004907 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 break;
4909 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004910 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 goto onError;
4912 continue;
4913 }
4914
Victor Stinner1d65d912015-10-05 13:43:50 +02004915 if (error_handler == _Py_ERROR_UNKNOWN)
4916 error_handler = get_error_handler(errors);
4917
4918 switch (error_handler) {
4919 case _Py_ERROR_IGNORE:
4920 s += (endinpos - startinpos);
4921 break;
4922
4923 case _Py_ERROR_REPLACE:
4924 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4925 goto onError;
4926 s += (endinpos - startinpos);
4927 break;
4928
4929 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004930 {
4931 Py_ssize_t i;
4932
Victor Stinner1d65d912015-10-05 13:43:50 +02004933 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4934 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004935 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004936 ch = (Py_UCS4)(unsigned char)(starts[i]);
4937 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4938 ch + 0xdc00);
4939 writer.pos++;
4940 }
4941 s += (endinpos - startinpos);
4942 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004943 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004944
4945 default:
4946 if (unicode_decode_call_errorhandler_writer(
4947 errors, &error_handler_obj,
4948 "utf-8", errmsg,
4949 &starts, &end, &startinpos, &endinpos, &exc, &s,
4950 &writer))
4951 goto onError;
4952 }
Victor Stinner785938e2011-12-11 20:09:03 +01004953 }
4954
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 if (consumed)
4957 *consumed = s - starts;
4958
Victor Stinner1d65d912015-10-05 13:43:50 +02004959 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004961 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962
4963onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004964 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004968}
4969
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970#ifdef __APPLE__
4971
4972/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004973 used to decode the command line arguments on Mac OS X.
4974
4975 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004976 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004977
4978wchar_t*
4979_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4980{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004981 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 wchar_t *unicode;
4983 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984
4985 /* Note: size will always be longer than the resulting Unicode
4986 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004987 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004989 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990 if (!unicode)
4991 return NULL;
4992
4993 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 if (ch > 0xFF) {
5004#if SIZEOF_WCHAR_T == 4
5005 assert(0);
5006#else
5007 assert(Py_UNICODE_IS_SURROGATE(ch));
5008 /* compute and append the two surrogates: */
5009 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5010 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5011#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005012 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 else {
5014 if (!ch && s == e)
5015 break;
5016 /* surrogateescape */
5017 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5018 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005019 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005021 return unicode;
5022}
5023
5024#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026/* Primary internal function which creates utf8 encoded bytes objects.
5027
5028 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005029 and allocate exactly as much space needed at the end. Else allocate the
5030 maximum possible needed (4 result bytes per Unicode character), and return
5031 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005032*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005033PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005034_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035{
Victor Stinner6099a032011-12-18 14:22:26 +01005036 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 void *data;
5038 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040 if (!PyUnicode_Check(unicode)) {
5041 PyErr_BadArgument();
5042 return NULL;
5043 }
5044
5045 if (PyUnicode_READY(unicode) == -1)
5046 return NULL;
5047
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005048 if (PyUnicode_UTF8(unicode))
5049 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5050 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005051
5052 kind = PyUnicode_KIND(unicode);
5053 data = PyUnicode_DATA(unicode);
5054 size = PyUnicode_GET_LENGTH(unicode);
5055
Benjamin Petersonead6b532011-12-20 17:23:42 -06005056 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005057 default:
5058 assert(0);
5059 case PyUnicode_1BYTE_KIND:
5060 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5061 assert(!PyUnicode_IS_ASCII(unicode));
5062 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5063 case PyUnicode_2BYTE_KIND:
5064 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5065 case PyUnicode_4BYTE_KIND:
5066 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068}
5069
Alexander Belopolsky40018472011-02-26 01:02:56 +00005070PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5072 Py_ssize_t size,
5073 const char *errors)
5074{
5075 PyObject *v, *unicode;
5076
5077 unicode = PyUnicode_FromUnicode(s, size);
5078 if (unicode == NULL)
5079 return NULL;
5080 v = _PyUnicode_AsUTF8String(unicode, errors);
5081 Py_DECREF(unicode);
5082 return v;
5083}
5084
5085PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005086PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005088 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089}
5090
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091/* --- UTF-32 Codec ------------------------------------------------------- */
5092
5093PyObject *
5094PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 Py_ssize_t size,
5096 const char *errors,
5097 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098{
5099 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5100}
5101
5102PyObject *
5103PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 Py_ssize_t size,
5105 const char *errors,
5106 int *byteorder,
5107 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108{
5109 const char *starts = s;
5110 Py_ssize_t startinpos;
5111 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005112 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005113 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005114 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005115 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 PyObject *errorHandler = NULL;
5118 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005119
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120 q = (unsigned char *)s;
5121 e = q + size;
5122
5123 if (byteorder)
5124 bo = *byteorder;
5125
5126 /* Check for BOM marks (U+FEFF) in the input and adjust current
5127 byte order setting accordingly. In native mode, the leading BOM
5128 mark is skipped, in all other modes, it is copied to the output
5129 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005130 if (bo == 0 && size >= 4) {
5131 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5132 if (bom == 0x0000FEFF) {
5133 bo = -1;
5134 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005136 else if (bom == 0xFFFE0000) {
5137 bo = 1;
5138 q += 4;
5139 }
5140 if (byteorder)
5141 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142 }
5143
Victor Stinnere64322e2012-10-30 23:12:47 +01005144 if (q == e) {
5145 if (consumed)
5146 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005147 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148 }
5149
Victor Stinnere64322e2012-10-30 23:12:47 +01005150#ifdef WORDS_BIGENDIAN
5151 le = bo < 0;
5152#else
5153 le = bo <= 0;
5154#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005155 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005156
Victor Stinner8f674cc2013-04-17 23:02:17 +02005157 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005158 writer.min_length = (e - q + 3) / 4;
5159 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005160 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005161
Victor Stinnere64322e2012-10-30 23:12:47 +01005162 while (1) {
5163 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005164 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005165
Victor Stinnere64322e2012-10-30 23:12:47 +01005166 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 enum PyUnicode_Kind kind = writer.kind;
5168 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005169 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005170 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005171 if (le) {
5172 do {
5173 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5174 if (ch > maxch)
5175 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 if (kind != PyUnicode_1BYTE_KIND &&
5177 Py_UNICODE_IS_SURROGATE(ch))
5178 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005179 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005180 q += 4;
5181 } while (q <= last);
5182 }
5183 else {
5184 do {
5185 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5186 if (ch > maxch)
5187 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005188 if (kind != PyUnicode_1BYTE_KIND &&
5189 Py_UNICODE_IS_SURROGATE(ch))
5190 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005191 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005192 q += 4;
5193 } while (q <= last);
5194 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005195 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005196 }
5197
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005198 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005199 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005200 startinpos = ((const char *)q) - starts;
5201 endinpos = startinpos + 4;
5202 }
5203 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005204 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005206 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005208 startinpos = ((const char *)q) - starts;
5209 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005211 else {
5212 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005213 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005214 goto onError;
5215 q += 4;
5216 continue;
5217 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005218 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005219 startinpos = ((const char *)q) - starts;
5220 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005222
5223 /* The remaining input chars are ignored if the callback
5224 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005225 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005231 }
5232
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 Py_XDECREF(errorHandler);
5237 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005238 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005239
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 Py_XDECREF(errorHandler);
5243 Py_XDECREF(exc);
5244 return NULL;
5245}
5246
5247PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248_PyUnicode_EncodeUTF32(PyObject *str,
5249 const char *errors,
5250 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005252 enum PyUnicode_Kind kind;
5253 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005254 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005255 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005256 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005257#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005258 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005260 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005261#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005262 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005263 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005264 PyObject *errorHandler = NULL;
5265 PyObject *exc = NULL;
5266 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005268 if (!PyUnicode_Check(str)) {
5269 PyErr_BadArgument();
5270 return NULL;
5271 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005272 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005273 return NULL;
5274 kind = PyUnicode_KIND(str);
5275 data = PyUnicode_DATA(str);
5276 len = PyUnicode_GET_LENGTH(str);
5277
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005278 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005279 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005280 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005281 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282 if (v == NULL)
5283 return NULL;
5284
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005285 /* output buffer is 4-bytes aligned */
5286 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5287 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005289 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005290 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005291 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005293 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005294 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005295 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005296 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005297 else
5298 encoding = "utf-32";
5299
5300 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005301 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5302 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005303 }
5304
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005305 pos = 0;
5306 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005307 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005308
5309 if (kind == PyUnicode_2BYTE_KIND) {
5310 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5311 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005313 else {
5314 assert(kind == PyUnicode_4BYTE_KIND);
5315 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5316 &out, native_ordering);
5317 }
5318 if (pos == len)
5319 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005320
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005321 rep = unicode_encode_call_errorhandler(
5322 errors, &errorHandler,
5323 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005324 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 if (!rep)
5326 goto error;
5327
5328 if (PyBytes_Check(rep)) {
5329 repsize = PyBytes_GET_SIZE(rep);
5330 if (repsize & 3) {
5331 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005332 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 "surrogates not allowed");
5334 goto error;
5335 }
5336 moreunits = repsize / 4;
5337 }
5338 else {
5339 assert(PyUnicode_Check(rep));
5340 if (PyUnicode_READY(rep) < 0)
5341 goto error;
5342 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5343 if (!PyUnicode_IS_ASCII(rep)) {
5344 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005345 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005346 "surrogates not allowed");
5347 goto error;
5348 }
5349 }
5350
5351 /* four bytes are reserved for each surrogate */
5352 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 Py_ssize_t morebytes = 4 * (moreunits - 1);
5355 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5356 /* integer overflow */
5357 PyErr_NoMemory();
5358 goto error;
5359 }
5360 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5361 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005363 }
5364
5365 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005366 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5367 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005368 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005369 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5371 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005372 }
5373
5374 Py_CLEAR(rep);
5375 }
5376
5377 /* Cut back to size actually needed. This is necessary for, for example,
5378 encoding of a string containing isolated surrogates and the 'ignore'
5379 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005381 if (nsize != PyBytes_GET_SIZE(v))
5382 _PyBytes_Resize(&v, nsize);
5383 Py_XDECREF(errorHandler);
5384 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005386 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 error:
5388 Py_XDECREF(rep);
5389 Py_XDECREF(errorHandler);
5390 Py_XDECREF(exc);
5391 Py_XDECREF(v);
5392 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393}
5394
Alexander Belopolsky40018472011-02-26 01:02:56 +00005395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5397 Py_ssize_t size,
5398 const char *errors,
5399 int byteorder)
5400{
5401 PyObject *result;
5402 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5403 if (tmp == NULL)
5404 return NULL;
5405 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5406 Py_DECREF(tmp);
5407 return result;
5408}
5409
5410PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005411PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005412{
Victor Stinnerb960b342011-11-20 19:12:52 +01005413 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414}
5415
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416/* --- UTF-16 Codec ------------------------------------------------------- */
5417
Tim Peters772747b2001-08-09 22:21:55 +00005418PyObject *
5419PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 Py_ssize_t size,
5421 const char *errors,
5422 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
Walter Dörwald69652032004-09-07 20:24:22 +00005424 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5425}
5426
5427PyObject *
5428PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 Py_ssize_t size,
5430 const char *errors,
5431 int *byteorder,
5432 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t startinpos;
5436 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005438 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005439 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005440 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005441 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 PyObject *errorHandler = NULL;
5443 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
Tim Peters772747b2001-08-09 22:21:55 +00005446 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005447 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448
5449 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005450 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005452 /* Check for BOM marks (U+FEFF) in the input and adjust current
5453 byte order setting accordingly. In native mode, the leading BOM
5454 mark is skipped, in all other modes, it is copied to the output
5455 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005456 if (bo == 0 && size >= 2) {
5457 const Py_UCS4 bom = (q[1] << 8) | q[0];
5458 if (bom == 0xFEFF) {
5459 q += 2;
5460 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005462 else if (bom == 0xFFFE) {
5463 q += 2;
5464 bo = 1;
5465 }
5466 if (byteorder)
5467 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
Antoine Pitrou63065d72012-05-15 23:48:04 +02005470 if (q == e) {
5471 if (consumed)
5472 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005473 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005474 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005475
Christian Heimes743e0cd2012-10-17 23:52:17 +02005476#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005477 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005479#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005480 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005482#endif
Tim Peters772747b2001-08-09 22:21:55 +00005483
Antoine Pitrou63065d72012-05-15 23:48:04 +02005484 /* Note: size will always be longer than the resulting Unicode
5485 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005486 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005487 writer.min_length = (e - q + 1) / 2;
5488 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005489 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005490
Antoine Pitrou63065d72012-05-15 23:48:04 +02005491 while (1) {
5492 Py_UCS4 ch = 0;
5493 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005494 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005495 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005496 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005497 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005498 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005499 native_ordering);
5500 else
5501 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005502 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005503 native_ordering);
5504 } else if (kind == PyUnicode_2BYTE_KIND) {
5505 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005506 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005507 native_ordering);
5508 } else {
5509 assert(kind == PyUnicode_4BYTE_KIND);
5510 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005511 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005512 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005513 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005514 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515
Antoine Pitrou63065d72012-05-15 23:48:04 +02005516 switch (ch)
5517 {
5518 case 0:
5519 /* remaining byte at the end? (size should be even) */
5520 if (q == e || consumed)
5521 goto End;
5522 errmsg = "truncated data";
5523 startinpos = ((const char *)q) - starts;
5524 endinpos = ((const char *)e) - starts;
5525 break;
5526 /* The remaining input chars are ignored if the callback
5527 chooses to skip the input */
5528 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005529 q -= 2;
5530 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005531 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005532 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005533 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005534 endinpos = ((const char *)e) - starts;
5535 break;
5536 case 2:
5537 errmsg = "illegal encoding";
5538 startinpos = ((const char *)q) - 2 - starts;
5539 endinpos = startinpos + 2;
5540 break;
5541 case 3:
5542 errmsg = "illegal UTF-16 surrogate";
5543 startinpos = ((const char *)q) - 4 - starts;
5544 endinpos = startinpos + 2;
5545 break;
5546 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005547 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005548 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 continue;
5550 }
5551
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005552 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005553 errors,
5554 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005556 &starts,
5557 (const char **)&e,
5558 &startinpos,
5559 &endinpos,
5560 &exc,
5561 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005562 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 }
5565
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566End:
Walter Dörwald69652032004-09-07 20:24:22 +00005567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 return NULL;
5579}
5580
Tim Peters772747b2001-08-09 22:21:55 +00005581PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005582_PyUnicode_EncodeUTF16(PyObject *str,
5583 const char *errors,
5584 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005586 enum PyUnicode_Kind kind;
5587 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005588 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005589 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005590 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005591 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005593 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005594#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005595 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005596#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 const char *encoding;
5598 Py_ssize_t nsize, pos;
5599 PyObject *errorHandler = NULL;
5600 PyObject *exc = NULL;
5601 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005602
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603 if (!PyUnicode_Check(str)) {
5604 PyErr_BadArgument();
5605 return NULL;
5606 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005607 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005608 return NULL;
5609 kind = PyUnicode_KIND(str);
5610 data = PyUnicode_DATA(str);
5611 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005612
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005613 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005614 if (kind == PyUnicode_4BYTE_KIND) {
5615 const Py_UCS4 *in = (const Py_UCS4 *)data;
5616 const Py_UCS4 *end = in + len;
5617 while (in < end)
5618 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005620 }
5621 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005623 nsize = len + pairs + (byteorder == 0);
5624 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 if (v == NULL)
5626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005628 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005629 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005630 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005632 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005634 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005635
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005636 if (kind == PyUnicode_1BYTE_KIND) {
5637 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5638 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005639 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005640
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005641 if (byteorder < 0)
5642 encoding = "utf-16-le";
5643 else if (byteorder > 0)
5644 encoding = "utf-16-be";
5645 else
5646 encoding = "utf-16";
5647
5648 pos = 0;
5649 while (pos < len) {
5650 Py_ssize_t repsize, moreunits;
5651
5652 if (kind == PyUnicode_2BYTE_KIND) {
5653 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5654 &out, native_ordering);
5655 }
5656 else {
5657 assert(kind == PyUnicode_4BYTE_KIND);
5658 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5659 &out, native_ordering);
5660 }
5661 if (pos == len)
5662 break;
5663
5664 rep = unicode_encode_call_errorhandler(
5665 errors, &errorHandler,
5666 encoding, "surrogates not allowed",
5667 str, &exc, pos, pos + 1, &pos);
5668 if (!rep)
5669 goto error;
5670
5671 if (PyBytes_Check(rep)) {
5672 repsize = PyBytes_GET_SIZE(rep);
5673 if (repsize & 1) {
5674 raise_encode_exception(&exc, encoding,
5675 str, pos - 1, pos,
5676 "surrogates not allowed");
5677 goto error;
5678 }
5679 moreunits = repsize / 2;
5680 }
5681 else {
5682 assert(PyUnicode_Check(rep));
5683 if (PyUnicode_READY(rep) < 0)
5684 goto error;
5685 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5686 if (!PyUnicode_IS_ASCII(rep)) {
5687 raise_encode_exception(&exc, encoding,
5688 str, pos - 1, pos,
5689 "surrogates not allowed");
5690 goto error;
5691 }
5692 }
5693
5694 /* two bytes are reserved for each surrogate */
5695 if (moreunits > 1) {
5696 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5697 Py_ssize_t morebytes = 2 * (moreunits - 1);
5698 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5699 /* integer overflow */
5700 PyErr_NoMemory();
5701 goto error;
5702 }
5703 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5704 goto error;
5705 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5706 }
5707
5708 if (PyBytes_Check(rep)) {
5709 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5710 out += moreunits;
5711 } else /* rep is unicode */ {
5712 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5713 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5714 &out, native_ordering);
5715 }
5716
5717 Py_CLEAR(rep);
5718 }
5719
5720 /* Cut back to size actually needed. This is necessary for, for example,
5721 encoding of a string containing isolated surrogates and the 'ignore' handler
5722 is used. */
5723 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5724 if (nsize != PyBytes_GET_SIZE(v))
5725 _PyBytes_Resize(&v, nsize);
5726 Py_XDECREF(errorHandler);
5727 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005728 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005729 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 error:
5731 Py_XDECREF(rep);
5732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
5734 Py_XDECREF(v);
5735 return NULL;
5736#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737}
5738
Alexander Belopolsky40018472011-02-26 01:02:56 +00005739PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5741 Py_ssize_t size,
5742 const char *errors,
5743 int byteorder)
5744{
5745 PyObject *result;
5746 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5747 if (tmp == NULL)
5748 return NULL;
5749 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5750 Py_DECREF(tmp);
5751 return result;
5752}
5753
5754PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005755PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
5760/* --- Unicode Escape Codec ----------------------------------------------- */
5761
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input still decodes to a pure ASCII string.

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be guaranteed: `size` is negative, the input contains a
   byte above 127, a backslash is the last byte, or an octal, \x, \u, \U or
   \N escape is present.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size; pointer arithmetic
       outside the buffer is undefined. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5816
Fredrik Lundh06d12682001-01-24 07:59:11 +00005817static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005818
Alexander Belopolsky40018472011-02-26 01:02:56 +00005819PyObject *
5820PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005821 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005825 Py_ssize_t startinpos;
5826 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005827 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 char* message;
5830 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 PyObject *errorHandler = NULL;
5832 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005833 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005834
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005836 if (len == 0)
5837 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838
5839 /* After length_of_escaped_ascii_string() there are two alternatives,
5840 either the string is pure ASCII with named escapes like \n, etc.
5841 and we determined it's exact size (common case)
5842 or it contains \x, \u, ... escape sequences. then we create a
5843 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005844 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005846 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005847 }
5848 else {
5849 /* Escaped strings will always be longer than the resulting
5850 Unicode string, so we start with size here and then reduce the
5851 length after conversion to the true value.
5852 (but if the error callback returns a long replacement string
5853 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005854 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 }
5856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 while (s < end) {
5862 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005863 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
5866 /* Non-escape characters are interpreted as Unicode ordinals */
5867 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868 x = (unsigned char)*s;
5869 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005870 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005871 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 continue;
5873 }
5874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 /* \ - Escapes */
5877 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005878 c = *s++;
5879 if (s > end)
5880 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005882 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005885#define WRITECHAR(ch) \
5886 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005887 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005888 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 case '\\': WRITECHAR('\\'); break;
5893 case '\'': WRITECHAR('\''); break;
5894 case '\"': WRITECHAR('\"'); break;
5895 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005897 case 'f': WRITECHAR('\014'); break;
5898 case 't': WRITECHAR('\t'); break;
5899 case 'n': WRITECHAR('\n'); break;
5900 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005902 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005903 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 case '0': case '1': case '2': case '3':
5908 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005909 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005910 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005911 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005912 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005913 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005915 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 break;
5917
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 /* hex escapes */
5919 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921 digits = 2;
5922 message = "truncated \\xXX escape";
5923 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005927 digits = 4;
5928 message = "truncated \\uXXXX escape";
5929 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005932 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933 digits = 8;
5934 message = "truncated \\UXXXXXXXX escape";
5935 hexescape:
5936 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005937 if (end - s < digits) {
5938 /* count only hex digits */
5939 for (; s < end; ++s) {
5940 c = (unsigned char)*s;
5941 if (!Py_ISXDIGIT(c))
5942 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005943 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005944 goto error;
5945 }
5946 for (; digits--; ++s) {
5947 c = (unsigned char)*s;
5948 if (!Py_ISXDIGIT(c))
5949 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005950 chr = (chr<<4) & ~0xF;
5951 if (c >= '0' && c <= '9')
5952 chr += c - '0';
5953 else if (c >= 'a' && c <= 'f')
5954 chr += 10 + c - 'a';
5955 else
5956 chr += 10 + c - 'A';
5957 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 /* _decoding_error will have already written into the
5960 target buffer. */
5961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005963 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005964 message = "illegal Unicode character";
5965 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005966 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005967 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005968 break;
5969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971 case 'N':
5972 message = "malformed \\N character escape";
5973 if (ucnhash_CAPI == NULL) {
5974 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5976 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005977 if (ucnhash_CAPI == NULL)
5978 goto ucnhashError;
5979 }
5980 if (*s == '{') {
5981 const char *start = s+1;
5982 /* look for the closing brace */
5983 while (*s != '}' && s < end)
5984 s++;
5985 if (s > start && s < end && *s == '}') {
5986 /* found a name. look it up in the unicode database */
5987 message = "unknown Unicode character name";
5988 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005989 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005990 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005991 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 goto store;
5993 }
5994 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005995 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005996
5997 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005998 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999 message = "\\ at end of string";
6000 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006001 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006002 }
6003 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006004 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006005 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006006 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006009 continue;
6010
6011 error:
6012 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006013 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006014 errors, &errorHandler,
6015 "unicodeescape", message,
6016 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006017 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006018 goto onError;
6019 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006021#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006022
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006025 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006028 PyErr_SetString(
6029 PyExc_UnicodeError,
6030 "\\N escapes not supported (can't load unicodedata module)"
6031 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(errorHandler);
6034 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006035 return NULL;
6036
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006038 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
6042}
6043
6044/* Return a Unicode-Escape string version of the Unicode object.
6045
6046 If quotes is true, the string is enclosed in u"" or u'' quotes as
6047 appropriate.
6048
6049*/
6050
Alexander Belopolsky40018472011-02-26 01:02:56 +00006051PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006055 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 int kind;
6058 void *data;
6059 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
Ezio Melottie7f90372012-10-05 03:33:31 +03006061 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006062 escape.
6063
Ezio Melottie7f90372012-10-05 03:33:31 +03006064 For UCS1 strings it's '\xxx', 4 bytes per source character.
6065 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6066 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006067 */
6068
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006069 if (!PyUnicode_Check(unicode)) {
6070 PyErr_BadArgument();
6071 return NULL;
6072 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006073 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006074 return NULL;
6075 len = PyUnicode_GET_LENGTH(unicode);
6076 kind = PyUnicode_KIND(unicode);
6077 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006078 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006079 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6080 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6081 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6082 }
6083
6084 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006085 return PyBytes_FromStringAndSize(NULL, 0);
6086
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006087 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006090 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006092 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 if (repr == NULL)
6095 return NULL;
6096
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006097 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006099 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006100 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006101
Walter Dörwald79e913e2007-05-12 11:08:06 +00006102 /* Escape backslashes */
6103 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 *p++ = '\\';
6105 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006106 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006107 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006108
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006109 /* Map 21-bit characters to '\U00xxxxxx' */
6110 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006111 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006112 *p++ = '\\';
6113 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006114 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6115 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6116 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6117 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6118 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6119 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6120 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6121 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006123 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006126 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 *p++ = '\\';
6128 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006129 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6130 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6131 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6132 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006134
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006135 /* Map special whitespace to '\t', \n', '\r' */
6136 else if (ch == '\t') {
6137 *p++ = '\\';
6138 *p++ = 't';
6139 }
6140 else if (ch == '\n') {
6141 *p++ = '\\';
6142 *p++ = 'n';
6143 }
6144 else if (ch == '\r') {
6145 *p++ = '\\';
6146 *p++ = 'r';
6147 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006148
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006149 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006150 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006152 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006153 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6154 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 /* Copy everything else as-is */
6158 else
6159 *p++ = (char) ch;
6160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006162 assert(p - PyBytes_AS_STRING(repr) > 0);
6163 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6164 return NULL;
6165 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166}
6167
Alexander Belopolsky40018472011-02-26 01:02:56 +00006168PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6170 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 PyObject *result;
6173 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6174 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 result = PyUnicode_AsUnicodeEscapeString(tmp);
6177 Py_DECREF(tmp);
6178 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179}
6180
6181/* --- Raw Unicode Escape Codec ------------------------------------------- */
6182
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183PyObject *
6184PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006185 Py_ssize_t size,
6186 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006189 Py_ssize_t startinpos;
6190 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006191 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 const char *end;
6193 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 PyObject *errorHandler = NULL;
6195 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006196
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006197 if (size == 0)
6198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 /* Escaped strings will always be longer than the resulting
6201 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 length after conversion to the true value. (But decoding error
6203 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006204 _PyUnicodeWriter_Init(&writer);
6205 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 end = s + size;
6208 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 unsigned char c;
6210 Py_UCS4 x;
6211 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006212 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* Non-escape characters are interpreted as Unicode ordinals */
6215 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006216 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006217 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006218 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 startinpos = s-starts;
6222
6223 /* \u-escapes are only interpreted iff the number of leading
6224 backslashes if odd */
6225 bs = s;
6226 for (;s < end;) {
6227 if (*s != '\\')
6228 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006229 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006230 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006231 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 }
6233 if (((s - bs) & 1) == 0 ||
6234 s >= end ||
6235 (*s != 'u' && *s != 'U')) {
6236 continue;
6237 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006238 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 count = *s=='u' ? 4 : 8;
6240 s++;
6241
6242 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 for (x = 0, i = 0; i < count; ++i, ++s) {
6244 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006245 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006247 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 errors, &errorHandler,
6249 "rawunicodeescape", "truncated \\uXXXX",
6250 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006251 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 goto onError;
6253 goto nextByte;
6254 }
6255 x = (x<<4) & ~0xF;
6256 if (c >= '0' && c <= '9')
6257 x += c - '0';
6258 else if (c >= 'a' && c <= 'f')
6259 x += 10 + c - 'a';
6260 else
6261 x += 10 + c - 'A';
6262 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006263 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006264 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006266 }
6267 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006268 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006269 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006270 errors, &errorHandler,
6271 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006273 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006275 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 nextByte:
6277 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 Py_XDECREF(errorHandler);
6280 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006281 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006282
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006284 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 Py_XDECREF(errorHandler);
6286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 return NULL;
6288}
6289
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006290
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006292PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006294 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 char *p;
6296 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006297 Py_ssize_t expandsize, pos;
6298 int kind;
6299 void *data;
6300 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006302 if (!PyUnicode_Check(unicode)) {
6303 PyErr_BadArgument();
6304 return NULL;
6305 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006306 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006307 return NULL;
6308 kind = PyUnicode_KIND(unicode);
6309 data = PyUnicode_DATA(unicode);
6310 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006311 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6312 bytes, and 1 byte characters 4. */
6313 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006314
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006317
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 if (repr == NULL)
6320 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006321 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006322 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006324 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 for (pos = 0; pos < len; pos++) {
6326 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* Map 32-bit characters to '\Uxxxxxxxx' */
6328 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006329 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006330 *p++ = '\\';
6331 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006332 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6333 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6334 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6335 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6336 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6337 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6338 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6339 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006340 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 *p++ = '\\';
6344 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006345 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6346 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6347 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6348 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 /* Copy everything else as-is */
6351 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 *p++ = (char) ch;
6353 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006354
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006355 assert(p > q);
6356 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006357 return NULL;
6358 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359}
6360
Alexander Belopolsky40018472011-02-26 01:02:56 +00006361PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006362PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6363 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006365 PyObject *result;
6366 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6367 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006368 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006369 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6370 Py_DECREF(tmp);
6371 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372}
6373
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374/* --- Unicode Internal Codec ------------------------------------------- */
6375
Alexander Belopolsky40018472011-02-26 01:02:56 +00006376PyObject *
6377_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006378 Py_ssize_t size,
6379 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006380{
6381 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 Py_ssize_t startinpos;
6383 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006385 const char *end;
6386 const char *reason;
6387 PyObject *errorHandler = NULL;
6388 PyObject *exc = NULL;
6389
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006390 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006391 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006392 1))
6393 return NULL;
6394
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006395 if (size == 0)
6396 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397
Victor Stinner8f674cc2013-04-17 23:02:17 +02006398 _PyUnicodeWriter_Init(&writer);
6399 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6400 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006402 }
6403 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006404
Victor Stinner8f674cc2013-04-17 23:02:17 +02006405 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006406 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006407 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006408 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006409 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006410 endinpos = end-starts;
6411 reason = "truncated input";
6412 goto error;
6413 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006414 /* We copy the raw representation one byte at a time because the
6415 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006416 ((char *) &uch)[0] = s[0];
6417 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006418#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006419 ((char *) &uch)[2] = s[2];
6420 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006421#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006422 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006423#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006424 /* We have to sanity check the raw data, otherwise doom looms for
6425 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006426 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006427 endinpos = s - starts + Py_UNICODE_SIZE;
6428 reason = "illegal code point (> 0x10FFFF)";
6429 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006430 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006431#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006432 s += Py_UNICODE_SIZE;
6433#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006434 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006435 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006436 Py_UNICODE uch2;
6437 ((char *) &uch2)[0] = s[0];
6438 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006439 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006440 {
Victor Stinner551ac952011-11-29 22:58:13 +01006441 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006442 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006443 }
6444 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006445#endif
6446
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006447 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006448 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006449 continue;
6450
6451 error:
6452 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006453 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006454 errors, &errorHandler,
6455 "unicode_internal", reason,
6456 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006457 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006458 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006459 }
6460
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006461 Py_XDECREF(errorHandler);
6462 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006463 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006464
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006466 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467 Py_XDECREF(errorHandler);
6468 Py_XDECREF(exc);
6469 return NULL;
6470}
6471
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472/* --- Latin-1 Codec ------------------------------------------------------ */
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
6475PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006476 Py_ssize_t size,
6477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006480 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481}
6482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006484static void
6485make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006486 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006487 PyObject *unicode,
6488 Py_ssize_t startpos, Py_ssize_t endpos,
6489 const char *reason)
6490{
6491 if (*exceptionObject == NULL) {
6492 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006494 encoding, unicode, startpos, endpos, reason);
6495 }
6496 else {
6497 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6498 goto onError;
6499 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6500 goto onError;
6501 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6502 goto onError;
6503 return;
6504 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006505 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006506 }
6507}
6508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006509/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006510static void
6511raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006512 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006513 PyObject *unicode,
6514 Py_ssize_t startpos, Py_ssize_t endpos,
6515 const char *reason)
6516{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006517 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006518 encoding, unicode, startpos, endpos, reason);
6519 if (*exceptionObject != NULL)
6520 PyCodec_StrictErrors(*exceptionObject);
6521}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522
6523/* error handling callback helper:
6524 build arguments, call the callback and check the arguments,
6525 put the result into newpos and return the replacement string, which
6526 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527static PyObject *
6528unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006529 PyObject **errorHandler,
6530 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006532 Py_ssize_t startpos, Py_ssize_t endpos,
6533 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006535 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006536 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537 PyObject *restuple;
6538 PyObject *resunicode;
6539
6540 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006542 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 }
6545
Benjamin Petersonbac79492012-01-14 13:34:47 -05006546 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 return NULL;
6548 len = PyUnicode_GET_LENGTH(unicode);
6549
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006550 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554
6555 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006557 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006560 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 Py_DECREF(restuple);
6562 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006564 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 &resunicode, newpos)) {
6566 Py_DECREF(restuple);
6567 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006568 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006569 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6570 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6571 Py_DECREF(restuple);
6572 return NULL;
6573 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 *newpos = len + *newpos;
6576 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006577 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 Py_DECREF(restuple);
6579 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006580 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006581 Py_INCREF(resunicode);
6582 Py_DECREF(restuple);
6583 return resunicode;
6584}
6585
Alexander Belopolsky40018472011-02-26 01:02:56 +00006586static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006588 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006589 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 /* input state */
6592 Py_ssize_t pos=0, size;
6593 int kind;
6594 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 /* pointer into the output */
6596 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006597 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6598 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006599 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006601 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006602 /* output object */
6603 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604
Benjamin Petersonbac79492012-01-14 13:34:47 -05006605 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 return NULL;
6607 size = PyUnicode_GET_LENGTH(unicode);
6608 kind = PyUnicode_KIND(unicode);
6609 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610 /* allocate enough for a simple encoding without
6611 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006612 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006613 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006614
6615 _PyBytesWriter_Init(&writer);
6616 str = _PyBytesWriter_Alloc(&writer, size);
6617 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006618 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006619
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006621 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006624 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006626 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006628 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 PyObject *repunicode;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006631 Py_ssize_t repsize, newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006634 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006636
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006637 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006639
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006640 /* Only overallocate the buffer if it's not the last write */
6641 writer.overallocate = (collend < size);
6642
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006644 if (error_handler == _Py_ERROR_UNKNOWN)
6645 error_handler = get_error_handler(errors);
6646
6647 switch (error_handler) {
6648 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006649 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006651
6652 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006653 memset(str, '?', collend - collstart);
6654 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006655 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006656 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 break;
Victor Stinner50149202015-09-22 00:26:54 +02006659
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006660 case _Py_ERROR_BACKSLASHREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006661 /* substract preallocated bytes */
6662 writer.min_size -= (collend - collstart);
6663 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006664 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006665 if (str == NULL)
6666 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006667 pos = collend;
6668 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006669
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006670 case _Py_ERROR_XMLCHARREFREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006671 /* substract preallocated bytes */
6672 writer.min_size -= (collend - collstart);
6673 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006674 unicode, collstart, collend);
6675 if (str == NULL)
6676 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 break;
Victor Stinner50149202015-09-22 00:26:54 +02006679
Victor Stinnerc3713e92015-09-29 12:32:13 +02006680 case _Py_ERROR_SURROGATEESCAPE:
6681 for (i = collstart; i < collend; ++i) {
6682 ch = PyUnicode_READ(kind, data, i);
6683 if (ch < 0xdc80 || 0xdcff < ch) {
6684 /* Not a UTF-8b surrogate */
6685 break;
6686 }
6687 *str++ = (char)(ch - 0xdc00);
6688 ++pos;
6689 }
6690 if (i >= collend)
6691 break;
6692 collstart = pos;
6693 assert(collstart != collend);
6694 /* fallback to general error handling */
6695
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 default:
Victor Stinner50149202015-09-22 00:26:54 +02006697 repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698 encoding, reason, unicode, &exc,
6699 collstart, collend, &newpos);
6700 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006701 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006703
Victor Stinnerad771582015-10-09 12:38:53 +02006704 /* substract preallocated bytes */
6705 writer.min_size -= 1;
6706
Martin v. Löwis011e8422009-05-05 04:43:17 +00006707 if (PyBytes_Check(repunicode)) {
6708 /* Directly copy bytes result to output. */
6709 repsize = PyBytes_Size(repunicode);
Victor Stinnerad771582015-10-09 12:38:53 +02006710
6711 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6712 if (str == NULL)
6713 goto onError;
6714
Martin v. Löwis011e8422009-05-05 04:43:17 +00006715 memcpy(str, PyBytes_AsString(repunicode), repsize);
6716 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006718 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006719 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006720 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006721
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 /* need more space? (at least enough for what we
6723 have+the replacement+the rest of the string, so
6724 we won't have to check space for encodable characters) */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006725 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerad771582015-10-09 12:38:53 +02006726
6727 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6728 if (str == NULL)
6729 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006730
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 /* check if there is anything unencodable in the replacement
6732 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 for (i = 0; repsize-->0; ++i, ++str) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006734 ch = PyUnicode_READ_CHAR(repunicode, i);
6735 if (ch >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006736 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006737 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 Py_DECREF(repunicode);
6739 goto onError;
6740 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006741 *str = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006743 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006744 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006745 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746
6747 /* If overallocation was disabled, ensure that it was the last
6748 write. Otherwise, we missed an optimization */
6749 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006750 }
6751 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006752
Victor Stinner50149202015-09-22 00:26:54 +02006753 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006755 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006756
6757 onError:
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006758 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006759 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006760 Py_XDECREF(exc);
6761 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762}
6763
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006765PyObject *
6766PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006767 Py_ssize_t size,
6768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 PyObject *result;
6771 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6772 if (unicode == NULL)
6773 return NULL;
6774 result = unicode_encode_ucs1(unicode, errors, 256);
6775 Py_DECREF(unicode);
6776 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777}
6778
Alexander Belopolsky40018472011-02-26 01:02:56 +00006779PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781{
6782 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 PyErr_BadArgument();
6784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006786 if (PyUnicode_READY(unicode) == -1)
6787 return NULL;
6788 /* Fast path: if it is a one-byte string, construct
6789 bytes object directly. */
6790 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6791 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6792 PyUnicode_GET_LENGTH(unicode));
6793 /* Non-Latin-1 characters present. Defer to above function to
6794 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006795 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006796}
6797
6798PyObject*
6799PyUnicode_AsLatin1String(PyObject *unicode)
6800{
6801 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802}
6803
6804/* --- 7-bit ASCII Codec -------------------------------------------------- */
6805
Alexander Belopolsky40018472011-02-26 01:02:56 +00006806PyObject *
6807PyUnicode_DecodeASCII(const char *s,
6808 Py_ssize_t size,
6809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006811 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006812 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006813 int kind;
6814 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t startinpos;
6816 Py_ssize_t endinpos;
6817 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006818 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006819 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006821 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006822
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006824 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006825
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006827 if (size == 1 && (unsigned char)s[0] < 128)
6828 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006829
Victor Stinner8f674cc2013-04-17 23:02:17 +02006830 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006831 writer.min_length = size;
6832 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006833 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006836 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006837 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006838 writer.pos = outpos;
6839 if (writer.pos == size)
6840 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006841
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006842 s += writer.pos;
6843 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006845 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006847 PyUnicode_WRITE(kind, data, writer.pos, c);
6848 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006850 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006852
6853 /* byte outsize range 0x00..0x7f: call the error handler */
6854
6855 if (error_handler == _Py_ERROR_UNKNOWN)
6856 error_handler = get_error_handler(errors);
6857
6858 switch (error_handler)
6859 {
6860 case _Py_ERROR_REPLACE:
6861 case _Py_ERROR_SURROGATEESCAPE:
6862 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006863 but we may switch to UCS2 at the first write */
6864 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6865 goto onError;
6866 kind = writer.kind;
6867 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006868
6869 if (error_handler == _Py_ERROR_REPLACE)
6870 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6871 else
6872 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6873 writer.pos++;
6874 ++s;
6875 break;
6876
6877 case _Py_ERROR_IGNORE:
6878 ++s;
6879 break;
6880
6881 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 startinpos = s-starts;
6883 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006884 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006885 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 "ascii", "ordinal not in range(128)",
6887 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006888 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006890 kind = writer.kind;
6891 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006894 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006896 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006897
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006899 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006900 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 return NULL;
6903}
6904
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006905/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006906PyObject *
6907PyUnicode_EncodeASCII(const Py_UNICODE *p,
6908 Py_ssize_t size,
6909 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006911 PyObject *result;
6912 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6913 if (unicode == NULL)
6914 return NULL;
6915 result = unicode_encode_ucs1(unicode, errors, 128);
6916 Py_DECREF(unicode);
6917 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
Alexander Belopolsky40018472011-02-26 01:02:56 +00006920PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006921_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922{
6923 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 PyErr_BadArgument();
6925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006927 if (PyUnicode_READY(unicode) == -1)
6928 return NULL;
6929 /* Fast path: if it is an ASCII-only string, construct bytes object
6930 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006931 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006932 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6933 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006934 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006935}
6936
6937PyObject *
6938PyUnicode_AsASCIIString(PyObject *unicode)
6939{
6940 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941}
6942
Victor Stinner99b95382011-07-04 14:23:54 +02006943#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006944
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006945/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006946
/* The code page converters below take their lengths as int.  When size_t is
   wider than int, inputs larger than INT_MAX have to be processed in
   several chunks; NEED_RETRY enables that chunking code. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide a fallback value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6954
6955static char*
6956code_page_name(UINT code_page, PyObject **obj)
6957{
6958 *obj = NULL;
6959 if (code_page == CP_ACP)
6960 return "mbcs";
6961 if (code_page == CP_UTF7)
6962 return "CP_UTF7";
6963 if (code_page == CP_UTF8)
6964 return "CP_UTF8";
6965
6966 *obj = PyBytes_FromFormat("cp%u", code_page);
6967 if (*obj == NULL)
6968 return NULL;
6969 return PyBytes_AS_STRING(*obj);
6970}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006971
Victor Stinner3a50e702011-10-18 21:21:00 +02006972static DWORD
6973decode_code_page_flags(UINT code_page)
6974{
6975 if (code_page == CP_UTF7) {
6976 /* The CP_UTF7 decoder only supports flags=0 */
6977 return 0;
6978 }
6979 else
6980 return MB_ERR_INVALID_CHARS;
6981}
6982
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006983/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006984 * Decode a byte string from a Windows code page into unicode object in strict
6985 * mode.
6986 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006987 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6988 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006989 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006990static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006991decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006992 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006993 const char *in,
6994 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995{
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006997 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006999
7000 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007001 assert(insize > 0);
7002 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7003 if (outsize <= 0)
7004 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005
7006 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007008 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007009 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 if (*v == NULL)
7011 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007012 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 }
7014 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007017 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020 }
7021
7022 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7024 if (outsize <= 0)
7025 goto error;
7026 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007027
Victor Stinner3a50e702011-10-18 21:21:00 +02007028error:
7029 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7030 return -2;
7031 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007032 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033}
7034
Victor Stinner3a50e702011-10-18 21:21:00 +02007035/*
7036 * Decode a byte string from a code page into unicode object with an error
7037 * handler.
7038 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007039 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 * UnicodeDecodeError exception and returns -1 on error.
7041 */
7042static int
7043decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 PyObject **v,
7045 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007046 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007047{
7048 const char *startin = in;
7049 const char *endin = in + size;
7050 const DWORD flags = decode_code_page_flags(code_page);
7051 /* Ideally, we should get reason from FormatMessage. This is the Windows
7052 2000 English version of the message. */
7053 const char *reason = "No mapping for the Unicode character exists "
7054 "in the target code page.";
7055 /* each step cannot decode more than 1 character, but a character can be
7056 represented as a surrogate pair */
7057 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007058 int insize;
7059 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 PyObject *errorHandler = NULL;
7061 PyObject *exc = NULL;
7062 PyObject *encoding_obj = NULL;
7063 char *encoding;
7064 DWORD err;
7065 int ret = -1;
7066
7067 assert(size > 0);
7068
7069 encoding = code_page_name(code_page, &encoding_obj);
7070 if (encoding == NULL)
7071 return -1;
7072
Victor Stinner7d00cc12014-03-17 23:08:06 +01007073 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7075 UnicodeDecodeError. */
7076 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7077 if (exc != NULL) {
7078 PyCodec_StrictErrors(exc);
7079 Py_CLEAR(exc);
7080 }
7081 goto error;
7082 }
7083
7084 if (*v == NULL) {
7085 /* Create unicode object */
7086 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7087 PyErr_NoMemory();
7088 goto error;
7089 }
Victor Stinnerab595942011-12-17 04:59:06 +01007090 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007091 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 if (*v == NULL)
7093 goto error;
7094 startout = PyUnicode_AS_UNICODE(*v);
7095 }
7096 else {
7097 /* Extend unicode object */
7098 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7099 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7100 PyErr_NoMemory();
7101 goto error;
7102 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007103 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 goto error;
7105 startout = PyUnicode_AS_UNICODE(*v) + n;
7106 }
7107
7108 /* Decode the byte string character per character */
7109 out = startout;
7110 while (in < endin)
7111 {
7112 /* Decode a character */
7113 insize = 1;
7114 do
7115 {
7116 outsize = MultiByteToWideChar(code_page, flags,
7117 in, insize,
7118 buffer, Py_ARRAY_LENGTH(buffer));
7119 if (outsize > 0)
7120 break;
7121 err = GetLastError();
7122 if (err != ERROR_NO_UNICODE_TRANSLATION
7123 && err != ERROR_INSUFFICIENT_BUFFER)
7124 {
7125 PyErr_SetFromWindowsErr(0);
7126 goto error;
7127 }
7128 insize++;
7129 }
7130 /* 4=maximum length of a UTF-8 sequence */
7131 while (insize <= 4 && (in + insize) <= endin);
7132
7133 if (outsize <= 0) {
7134 Py_ssize_t startinpos, endinpos, outpos;
7135
Victor Stinner7d00cc12014-03-17 23:08:06 +01007136 /* last character in partial decode? */
7137 if (in + insize >= endin && !final)
7138 break;
7139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 startinpos = in - startin;
7141 endinpos = startinpos + 1;
7142 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007143 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 errors, &errorHandler,
7145 encoding, reason,
7146 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007147 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 {
7149 goto error;
7150 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007151 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 }
7153 else {
7154 in += insize;
7155 memcpy(out, buffer, outsize * sizeof(wchar_t));
7156 out += outsize;
7157 }
7158 }
7159
7160 /* write a NUL character at the end */
7161 *out = 0;
7162
7163 /* Extend unicode object */
7164 outsize = out - startout;
7165 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007166 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007168 /* (in - startin) <= size and size is an int */
7169 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007170
7171error:
7172 Py_XDECREF(encoding_obj);
7173 Py_XDECREF(errorHandler);
7174 Py_XDECREF(exc);
7175 return ret;
7176}
7177
Victor Stinner3a50e702011-10-18 21:21:00 +02007178static PyObject *
7179decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007180 const char *s, Py_ssize_t size,
7181 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182{
Victor Stinner76a31a62011-11-04 00:05:13 +01007183 PyObject *v = NULL;
7184 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 if (code_page < 0) {
7187 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7188 return NULL;
7189 }
7190
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193
Victor Stinner76a31a62011-11-04 00:05:13 +01007194 do
7195 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007196#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 if (size > INT_MAX) {
7198 chunk_size = INT_MAX;
7199 final = 0;
7200 done = 0;
7201 }
7202 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007204 {
7205 chunk_size = (int)size;
7206 final = (consumed == NULL);
7207 done = 1;
7208 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 if (chunk_size == 0 && done) {
7211 if (v != NULL)
7212 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 converted = decode_code_page_strict(code_page, &v,
7217 s, chunk_size);
7218 if (converted == -2)
7219 converted = decode_code_page_errors(code_page, &v,
7220 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007221 errors, final);
7222 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007223
7224 if (converted < 0) {
7225 Py_XDECREF(v);
7226 return NULL;
7227 }
7228
7229 if (consumed)
7230 *consumed += converted;
7231
7232 s += converted;
7233 size -= converted;
7234 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007235
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007236 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237}
7238
Alexander Belopolsky40018472011-02-26 01:02:56 +00007239PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007240PyUnicode_DecodeCodePageStateful(int code_page,
7241 const char *s,
7242 Py_ssize_t size,
7243 const char *errors,
7244 Py_ssize_t *consumed)
7245{
7246 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7247}
7248
7249PyObject *
7250PyUnicode_DecodeMBCSStateful(const char *s,
7251 Py_ssize_t size,
7252 const char *errors,
7253 Py_ssize_t *consumed)
7254{
7255 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7256}
7257
7258PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007259PyUnicode_DecodeMBCS(const char *s,
7260 Py_ssize_t size,
7261 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007262{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7264}
7265
Victor Stinner3a50e702011-10-18 21:21:00 +02007266static DWORD
7267encode_code_page_flags(UINT code_page, const char *errors)
7268{
7269 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007270 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 }
7272 else if (code_page == CP_UTF7) {
7273 /* CP_UTF7 only supports flags=0 */
7274 return 0;
7275 }
7276 else {
7277 if (errors != NULL && strcmp(errors, "replace") == 0)
7278 return 0;
7279 else
7280 return WC_NO_BEST_FIT_CHARS;
7281 }
7282}
7283
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 * Encode a Unicode string to a Windows code page into a byte string in strict
7286 * mode.
7287 *
7288 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007289 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007291static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007292encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007293 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007295{
Victor Stinner554f3f02010-06-16 23:33:54 +00007296 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 BOOL *pusedDefaultChar = &usedDefaultChar;
7298 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007299 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007300 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007301 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 const DWORD flags = encode_code_page_flags(code_page, NULL);
7303 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007304 /* Create a substring so that we can get the UTF-16 representation
7305 of just the slice under consideration. */
7306 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007307
Martin v. Löwis3d325192011-11-04 18:23:06 +01007308 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007311 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007313 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007314
Victor Stinner2fc507f2011-11-04 20:06:39 +01007315 substring = PyUnicode_Substring(unicode, offset, offset+len);
7316 if (substring == NULL)
7317 return -1;
7318 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7319 if (p == NULL) {
7320 Py_DECREF(substring);
7321 return -1;
7322 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007323 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007325 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007327 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 NULL, 0,
7329 NULL, pusedDefaultChar);
7330 if (outsize <= 0)
7331 goto error;
7332 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007333 if (pusedDefaultChar && *pusedDefaultChar) {
7334 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007336 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007337
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007341 if (*outbytes == NULL) {
7342 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007344 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346 }
7347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 const Py_ssize_t n = PyBytes_Size(*outbytes);
7350 if (outsize > PY_SSIZE_T_MAX - n) {
7351 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007352 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007355 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7356 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007358 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360 }
7361
7362 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007364 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 out, outsize,
7366 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007367 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 if (outsize <= 0)
7369 goto error;
7370 if (pusedDefaultChar && *pusedDefaultChar)
7371 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007372 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007373
Victor Stinner3a50e702011-10-18 21:21:00 +02007374error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7377 return -2;
7378 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007379 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007380}
7381
Victor Stinner3a50e702011-10-18 21:21:00 +02007382/*
7383 * Encode a Unicode string to a Windows code page into a byte string using a
7384 * error handler.
7385 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007386 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 * -1 on other error.
7388 */
7389static int
7390encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007391 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007392 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007393{
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007395 Py_ssize_t pos = unicode_offset;
7396 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 /* Ideally, we should get reason from FormatMessage. This is the Windows
7398 2000 English version of the message. */
7399 const char *reason = "invalid character";
7400 /* 4=maximum length of a UTF-8 sequence */
7401 char buffer[4];
7402 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7403 Py_ssize_t outsize;
7404 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 PyObject *errorHandler = NULL;
7406 PyObject *exc = NULL;
7407 PyObject *encoding_obj = NULL;
7408 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 PyObject *rep;
7411 int ret = -1;
7412
7413 assert(insize > 0);
7414
7415 encoding = code_page_name(code_page, &encoding_obj);
7416 if (encoding == NULL)
7417 return -1;
7418
7419 if (errors == NULL || strcmp(errors, "strict") == 0) {
7420 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7421 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007422 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 if (exc != NULL) {
7424 PyCodec_StrictErrors(exc);
7425 Py_DECREF(exc);
7426 }
7427 Py_XDECREF(encoding_obj);
7428 return -1;
7429 }
7430
7431 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7432 pusedDefaultChar = &usedDefaultChar;
7433 else
7434 pusedDefaultChar = NULL;
7435
7436 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7437 PyErr_NoMemory();
7438 goto error;
7439 }
7440 outsize = insize * Py_ARRAY_LENGTH(buffer);
7441
7442 if (*outbytes == NULL) {
7443 /* Create string object */
7444 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7445 if (*outbytes == NULL)
7446 goto error;
7447 out = PyBytes_AS_STRING(*outbytes);
7448 }
7449 else {
7450 /* Extend string object */
7451 Py_ssize_t n = PyBytes_Size(*outbytes);
7452 if (n > PY_SSIZE_T_MAX - outsize) {
7453 PyErr_NoMemory();
7454 goto error;
7455 }
7456 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7457 goto error;
7458 out = PyBytes_AS_STRING(*outbytes) + n;
7459 }
7460
7461 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007462 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7465 wchar_t chars[2];
7466 int charsize;
7467 if (ch < 0x10000) {
7468 chars[0] = (wchar_t)ch;
7469 charsize = 1;
7470 }
7471 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007472 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7473 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 charsize = 2;
7475 }
7476
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007478 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 buffer, Py_ARRAY_LENGTH(buffer),
7480 NULL, pusedDefaultChar);
7481 if (outsize > 0) {
7482 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7483 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007484 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 memcpy(out, buffer, outsize);
7486 out += outsize;
7487 continue;
7488 }
7489 }
7490 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7491 PyErr_SetFromWindowsErr(0);
7492 goto error;
7493 }
7494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 rep = unicode_encode_call_errorhandler(
7496 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007497 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (rep == NULL)
7500 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007501 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502
7503 if (PyBytes_Check(rep)) {
7504 outsize = PyBytes_GET_SIZE(rep);
7505 if (outsize != 1) {
7506 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7507 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7508 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7509 Py_DECREF(rep);
7510 goto error;
7511 }
7512 out = PyBytes_AS_STRING(*outbytes) + offset;
7513 }
7514 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7515 out += outsize;
7516 }
7517 else {
7518 Py_ssize_t i;
7519 enum PyUnicode_Kind kind;
7520 void *data;
7521
Benjamin Petersonbac79492012-01-14 13:34:47 -05007522 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 Py_DECREF(rep);
7524 goto error;
7525 }
7526
7527 outsize = PyUnicode_GET_LENGTH(rep);
7528 if (outsize != 1) {
7529 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7530 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7531 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7532 Py_DECREF(rep);
7533 goto error;
7534 }
7535 out = PyBytes_AS_STRING(*outbytes) + offset;
7536 }
7537 kind = PyUnicode_KIND(rep);
7538 data = PyUnicode_DATA(rep);
7539 for (i=0; i < outsize; i++) {
7540 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7541 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007542 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007543 encoding, unicode,
7544 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 "unable to encode error handler result to ASCII");
7546 Py_DECREF(rep);
7547 goto error;
7548 }
7549 *out = (unsigned char)ch;
7550 out++;
7551 }
7552 }
7553 Py_DECREF(rep);
7554 }
7555 /* write a NUL byte */
7556 *out = 0;
7557 outsize = out - PyBytes_AS_STRING(*outbytes);
7558 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7559 if (_PyBytes_Resize(outbytes, outsize) < 0)
7560 goto error;
7561 ret = 0;
7562
7563error:
7564 Py_XDECREF(encoding_obj);
7565 Py_XDECREF(errorHandler);
7566 Py_XDECREF(exc);
7567 return ret;
7568}
7569
Victor Stinner3a50e702011-10-18 21:21:00 +02007570static PyObject *
7571encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007572 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 const char *errors)
7574{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007575 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007577 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007578 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007579
Victor Stinner29dacf22015-01-26 16:41:32 +01007580 if (!PyUnicode_Check(unicode)) {
7581 PyErr_BadArgument();
7582 return NULL;
7583 }
7584
Benjamin Petersonbac79492012-01-14 13:34:47 -05007585 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 return NULL;
7587 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007588
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 if (code_page < 0) {
7590 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7591 return NULL;
7592 }
7593
Martin v. Löwis3d325192011-11-04 18:23:06 +01007594 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007595 return PyBytes_FromStringAndSize(NULL, 0);
7596
Victor Stinner7581cef2011-11-03 22:32:33 +01007597 offset = 0;
7598 do
7599 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007600#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007601 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 chunks. */
7603 if (len > INT_MAX/2) {
7604 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007605 done = 0;
7606 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007607 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007608#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007609 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007610 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007611 done = 1;
7612 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007613
Victor Stinner76a31a62011-11-04 00:05:13 +01007614 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007616 errors);
7617 if (ret == -2)
7618 ret = encode_code_page_errors(code_page, &outbytes,
7619 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007621 if (ret < 0) {
7622 Py_XDECREF(outbytes);
7623 return NULL;
7624 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007625
Victor Stinner7581cef2011-11-03 22:32:33 +01007626 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007628 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007629
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 return outbytes;
7631}
7632
7633PyObject *
7634PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7635 Py_ssize_t size,
7636 const char *errors)
7637{
Victor Stinner7581cef2011-11-03 22:32:33 +01007638 PyObject *unicode, *res;
7639 unicode = PyUnicode_FromUnicode(p, size);
7640 if (unicode == NULL)
7641 return NULL;
7642 res = encode_code_page(CP_ACP, unicode, errors);
7643 Py_DECREF(unicode);
7644 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007645}
7646
7647PyObject *
7648PyUnicode_EncodeCodePage(int code_page,
7649 PyObject *unicode,
7650 const char *errors)
7651{
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007653}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007654
Alexander Belopolsky40018472011-02-26 01:02:56 +00007655PyObject *
7656PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007657{
Victor Stinner7581cef2011-11-03 22:32:33 +01007658 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007659}
7660
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007661#undef NEED_RETRY
7662
Victor Stinner99b95382011-07-04 14:23:54 +02007663#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007664
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665/* --- Character Mapping Codec -------------------------------------------- */
7666
Victor Stinnerfb161b12013-04-18 01:44:27 +02007667static int
7668charmap_decode_string(const char *s,
7669 Py_ssize_t size,
7670 PyObject *mapping,
7671 const char *errors,
7672 _PyUnicodeWriter *writer)
7673{
7674 const char *starts = s;
7675 const char *e;
7676 Py_ssize_t startinpos, endinpos;
7677 PyObject *errorHandler = NULL, *exc = NULL;
7678 Py_ssize_t maplen;
7679 enum PyUnicode_Kind mapkind;
7680 void *mapdata;
7681 Py_UCS4 x;
7682 unsigned char ch;
7683
7684 if (PyUnicode_READY(mapping) == -1)
7685 return -1;
7686
7687 maplen = PyUnicode_GET_LENGTH(mapping);
7688 mapdata = PyUnicode_DATA(mapping);
7689 mapkind = PyUnicode_KIND(mapping);
7690
7691 e = s + size;
7692
7693 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7694 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7695 * is disabled in encoding aliases, latin1 is preferred because
7696 * its implementation is faster. */
7697 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7698 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7699 Py_UCS4 maxchar = writer->maxchar;
7700
7701 assert (writer->kind == PyUnicode_1BYTE_KIND);
7702 while (s < e) {
7703 ch = *s;
7704 x = mapdata_ucs1[ch];
7705 if (x > maxchar) {
7706 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7707 goto onError;
7708 maxchar = writer->maxchar;
7709 outdata = (Py_UCS1 *)writer->data;
7710 }
7711 outdata[writer->pos] = x;
7712 writer->pos++;
7713 ++s;
7714 }
7715 return 0;
7716 }
7717
7718 while (s < e) {
7719 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7720 enum PyUnicode_Kind outkind = writer->kind;
7721 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7722 if (outkind == PyUnicode_1BYTE_KIND) {
7723 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7724 Py_UCS4 maxchar = writer->maxchar;
7725 while (s < e) {
7726 ch = *s;
7727 x = mapdata_ucs2[ch];
7728 if (x > maxchar)
7729 goto Error;
7730 outdata[writer->pos] = x;
7731 writer->pos++;
7732 ++s;
7733 }
7734 break;
7735 }
7736 else if (outkind == PyUnicode_2BYTE_KIND) {
7737 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7738 while (s < e) {
7739 ch = *s;
7740 x = mapdata_ucs2[ch];
7741 if (x == 0xFFFE)
7742 goto Error;
7743 outdata[writer->pos] = x;
7744 writer->pos++;
7745 ++s;
7746 }
7747 break;
7748 }
7749 }
7750 ch = *s;
7751
7752 if (ch < maplen)
7753 x = PyUnicode_READ(mapkind, mapdata, ch);
7754 else
7755 x = 0xfffe; /* invalid value */
7756Error:
7757 if (x == 0xfffe)
7758 {
7759 /* undefined mapping */
7760 startinpos = s-starts;
7761 endinpos = startinpos+1;
7762 if (unicode_decode_call_errorhandler_writer(
7763 errors, &errorHandler,
7764 "charmap", "character maps to <undefined>",
7765 &starts, &e, &startinpos, &endinpos, &exc, &s,
7766 writer)) {
7767 goto onError;
7768 }
7769 continue;
7770 }
7771
7772 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7773 goto onError;
7774 ++s;
7775 }
7776 Py_XDECREF(errorHandler);
7777 Py_XDECREF(exc);
7778 return 0;
7779
7780onError:
7781 Py_XDECREF(errorHandler);
7782 Py_XDECREF(exc);
7783 return -1;
7784}
7785
7786static int
7787charmap_decode_mapping(const char *s,
7788 Py_ssize_t size,
7789 PyObject *mapping,
7790 const char *errors,
7791 _PyUnicodeWriter *writer)
7792{
7793 const char *starts = s;
7794 const char *e;
7795 Py_ssize_t startinpos, endinpos;
7796 PyObject *errorHandler = NULL, *exc = NULL;
7797 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007798 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007799
7800 e = s + size;
7801
7802 while (s < e) {
7803 ch = *s;
7804
7805 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7806 key = PyLong_FromLong((long)ch);
7807 if (key == NULL)
7808 goto onError;
7809
7810 item = PyObject_GetItem(mapping, key);
7811 Py_DECREF(key);
7812 if (item == NULL) {
7813 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7814 /* No mapping found means: mapping is undefined. */
7815 PyErr_Clear();
7816 goto Undefined;
7817 } else
7818 goto onError;
7819 }
7820
7821 /* Apply mapping */
7822 if (item == Py_None)
7823 goto Undefined;
7824 if (PyLong_Check(item)) {
7825 long value = PyLong_AS_LONG(item);
7826 if (value == 0xFFFE)
7827 goto Undefined;
7828 if (value < 0 || value > MAX_UNICODE) {
7829 PyErr_Format(PyExc_TypeError,
7830 "character mapping must be in range(0x%lx)",
7831 (unsigned long)MAX_UNICODE + 1);
7832 goto onError;
7833 }
7834
7835 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7836 goto onError;
7837 }
7838 else if (PyUnicode_Check(item)) {
7839 if (PyUnicode_READY(item) == -1)
7840 goto onError;
7841 if (PyUnicode_GET_LENGTH(item) == 1) {
7842 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7843 if (value == 0xFFFE)
7844 goto Undefined;
7845 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7846 goto onError;
7847 }
7848 else {
7849 writer->overallocate = 1;
7850 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7851 goto onError;
7852 }
7853 }
7854 else {
7855 /* wrong return value */
7856 PyErr_SetString(PyExc_TypeError,
7857 "character mapping must return integer, None or str");
7858 goto onError;
7859 }
7860 Py_CLEAR(item);
7861 ++s;
7862 continue;
7863
7864Undefined:
7865 /* undefined mapping */
7866 Py_CLEAR(item);
7867 startinpos = s-starts;
7868 endinpos = startinpos+1;
7869 if (unicode_decode_call_errorhandler_writer(
7870 errors, &errorHandler,
7871 "charmap", "character maps to <undefined>",
7872 &starts, &e, &startinpos, &endinpos, &exc, &s,
7873 writer)) {
7874 goto onError;
7875 }
7876 }
7877 Py_XDECREF(errorHandler);
7878 Py_XDECREF(exc);
7879 return 0;
7880
7881onError:
7882 Py_XDECREF(item);
7883 Py_XDECREF(errorHandler);
7884 Py_XDECREF(exc);
7885 return -1;
7886}
7887
Alexander Belopolsky40018472011-02-26 01:02:56 +00007888PyObject *
7889PyUnicode_DecodeCharmap(const char *s,
7890 Py_ssize_t size,
7891 PyObject *mapping,
7892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007894 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007895
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 /* Default to Latin-1 */
7897 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007901 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007902 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007903 writer.min_length = size;
7904 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007906
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007907 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007908 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7909 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007910 }
7911 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007912 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7913 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007915 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007916
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007918 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 return NULL;
7920}
7921
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922/* Charmap encoding: the lookup table */
7923
Alexander Belopolsky40018472011-02-26 01:02:56 +00007924struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 PyObject_HEAD
7926 unsigned char level1[32];
7927 int count2, count3;
7928 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007929};
7930
7931static PyObject*
7932encoding_map_size(PyObject *obj, PyObject* args)
7933{
7934 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007935 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007937}
7938
7939static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007940 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 PyDoc_STR("Return the size (in bytes) of this object") },
7942 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943};
7944
7945static void
7946encoding_map_dealloc(PyObject* o)
7947{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007948 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949}
7950
7951static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007952 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 "EncodingMap", /*tp_name*/
7954 sizeof(struct encoding_map), /*tp_basicsize*/
7955 0, /*tp_itemsize*/
7956 /* methods */
7957 encoding_map_dealloc, /*tp_dealloc*/
7958 0, /*tp_print*/
7959 0, /*tp_getattr*/
7960 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007961 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 0, /*tp_repr*/
7963 0, /*tp_as_number*/
7964 0, /*tp_as_sequence*/
7965 0, /*tp_as_mapping*/
7966 0, /*tp_hash*/
7967 0, /*tp_call*/
7968 0, /*tp_str*/
7969 0, /*tp_getattro*/
7970 0, /*tp_setattro*/
7971 0, /*tp_as_buffer*/
7972 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7973 0, /*tp_doc*/
7974 0, /*tp_traverse*/
7975 0, /*tp_clear*/
7976 0, /*tp_richcompare*/
7977 0, /*tp_weaklistoffset*/
7978 0, /*tp_iter*/
7979 0, /*tp_iternext*/
7980 encoding_map_methods, /*tp_methods*/
7981 0, /*tp_members*/
7982 0, /*tp_getset*/
7983 0, /*tp_base*/
7984 0, /*tp_dict*/
7985 0, /*tp_descr_get*/
7986 0, /*tp_descr_set*/
7987 0, /*tp_dictoffset*/
7988 0, /*tp_init*/
7989 0, /*tp_alloc*/
7990 0, /*tp_new*/
7991 0, /*tp_free*/
7992 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993};
7994
7995PyObject*
7996PyUnicode_BuildEncodingMap(PyObject* string)
7997{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998 PyObject *result;
7999 struct encoding_map *mresult;
8000 int i;
8001 int need_dict = 0;
8002 unsigned char level1[32];
8003 unsigned char level2[512];
8004 unsigned char *mlevel1, *mlevel2, *mlevel3;
8005 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006 int kind;
8007 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008008 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008009 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008011 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008012 PyErr_BadArgument();
8013 return NULL;
8014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008015 kind = PyUnicode_KIND(string);
8016 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008017 length = PyUnicode_GET_LENGTH(string);
8018 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019 memset(level1, 0xFF, sizeof level1);
8020 memset(level2, 0xFF, sizeof level2);
8021
8022 /* If there isn't a one-to-one mapping of NULL to \0,
8023 or if there are non-BMP characters, we need to use
8024 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008025 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008027 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 ch = PyUnicode_READ(kind, data, i);
8030 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031 need_dict = 1;
8032 break;
8033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008034 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035 /* unmapped character */
8036 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037 l1 = ch >> 11;
8038 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 if (level1[l1] == 0xFF)
8040 level1[l1] = count2++;
8041 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008042 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 }
8044
8045 if (count2 >= 0xFF || count3 >= 0xFF)
8046 need_dict = 1;
8047
8048 if (need_dict) {
8049 PyObject *result = PyDict_New();
8050 PyObject *key, *value;
8051 if (!result)
8052 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008053 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008054 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008055 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056 if (!key || !value)
8057 goto failed1;
8058 if (PyDict_SetItem(result, key, value) == -1)
8059 goto failed1;
8060 Py_DECREF(key);
8061 Py_DECREF(value);
8062 }
8063 return result;
8064 failed1:
8065 Py_XDECREF(key);
8066 Py_XDECREF(value);
8067 Py_DECREF(result);
8068 return NULL;
8069 }
8070
8071 /* Create a three-level trie */
8072 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8073 16*count2 + 128*count3 - 1);
8074 if (!result)
8075 return PyErr_NoMemory();
8076 PyObject_Init(result, &EncodingMapType);
8077 mresult = (struct encoding_map*)result;
8078 mresult->count2 = count2;
8079 mresult->count3 = count3;
8080 mlevel1 = mresult->level1;
8081 mlevel2 = mresult->level23;
8082 mlevel3 = mresult->level23 + 16*count2;
8083 memcpy(mlevel1, level1, 32);
8084 memset(mlevel2, 0xFF, 16*count2);
8085 memset(mlevel3, 0, 128*count3);
8086 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008087 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008089 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8090 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 /* unmapped character */
8092 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008093 o1 = ch>>11;
8094 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 i2 = 16*mlevel1[o1] + o2;
8096 if (mlevel2[i2] == 0xFF)
8097 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008098 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099 i3 = 128*mlevel2[i2] + o3;
8100 mlevel3[i3] = i;
8101 }
8102 return result;
8103}
8104
8105static int
Victor Stinner22168992011-11-20 17:09:18 +01008106encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107{
8108 struct encoding_map *map = (struct encoding_map*)mapping;
8109 int l1 = c>>11;
8110 int l2 = (c>>7) & 0xF;
8111 int l3 = c & 0x7F;
8112 int i;
8113
Victor Stinner22168992011-11-20 17:09:18 +01008114 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 if (c == 0)
8117 return 0;
8118 /* level 1*/
8119 i = map->level1[l1];
8120 if (i == 0xFF) {
8121 return -1;
8122 }
8123 /* level 2*/
8124 i = map->level23[16*i+l2];
8125 if (i == 0xFF) {
8126 return -1;
8127 }
8128 /* level 3 */
8129 i = map->level23[16*map->count2 + 128*i + l3];
8130 if (i == 0) {
8131 return -1;
8132 }
8133 return i;
8134}
8135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136/* Lookup the character ch in the mapping. If the character
8137 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008138 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008139static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008140charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141{
Christian Heimes217cfd12007-12-02 14:31:20 +00008142 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 PyObject *x;
8144
8145 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 x = PyObject_GetItem(mapping, w);
8148 Py_DECREF(w);
8149 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8151 /* No mapping found means: mapping is undefined. */
8152 PyErr_Clear();
8153 x = Py_None;
8154 Py_INCREF(x);
8155 return x;
8156 } else
8157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008159 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008161 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 long value = PyLong_AS_LONG(x);
8163 if (value < 0 || value > 255) {
8164 PyErr_SetString(PyExc_TypeError,
8165 "character mapping must be in range(256)");
8166 Py_DECREF(x);
8167 return NULL;
8168 }
8169 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008171 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 /* wrong return value */
8175 PyErr_Format(PyExc_TypeError,
8176 "character mapping must return integer, bytes or None, not %.400s",
8177 x->ob_type->tp_name);
8178 Py_DECREF(x);
8179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 }
8181}
8182
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008184charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8187 /* exponentially overallocate to minimize reallocations */
8188 if (requiredsize < 2*outsize)
8189 requiredsize = 2*outsize;
8190 if (_PyBytes_Resize(outobj, requiredsize))
8191 return -1;
8192 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193}
8194
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the mapped byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* the lookup or the resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008199 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200 space is available. Return a new reference to the object that
8201 was put in the output buffer, or Py_None, if the mapping was undefined
8202 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008203 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008205charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008206 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008208 PyObject *rep;
8209 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008210 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211
Christian Heimes90aa7642007-12-19 02:45:37 +00008212 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 if (res == -1)
8216 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 if (outsize<requiredsize)
8218 if (charmapencode_resize(outobj, outpos, requiredsize))
8219 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008220 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 outstart[(*outpos)++] = (char)res;
8222 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223 }
8224
8225 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008226 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 Py_DECREF(rep);
8230 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 if (PyLong_Check(rep)) {
8233 Py_ssize_t requiredsize = *outpos+1;
8234 if (outsize<requiredsize)
8235 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8236 Py_DECREF(rep);
8237 return enc_EXCEPTION;
8238 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008239 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 else {
8243 const char *repchars = PyBytes_AS_STRING(rep);
8244 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8245 Py_ssize_t requiredsize = *outpos+repsize;
8246 if (outsize<requiredsize)
8247 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8248 Py_DECREF(rep);
8249 return enc_EXCEPTION;
8250 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008251 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 memcpy(outstart + *outpos, repchars, repsize);
8253 *outpos += repsize;
8254 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008256 Py_DECREF(rep);
8257 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258}
8259
8260/* handle an error in PyUnicode_EncodeCharmap
8261 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008262static int
8263charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008264 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008266 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008267 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268{
8269 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008270 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008271 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008272 enum PyUnicode_Kind kind;
8273 void *data;
8274 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008276 Py_ssize_t collstartpos = *inpos;
8277 Py_ssize_t collendpos = *inpos+1;
8278 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 char *encoding = "charmap";
8280 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008283 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284
Benjamin Petersonbac79492012-01-14 13:34:47 -05008285 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008286 return -1;
8287 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 /* find all unencodable characters */
8289 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008291 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008292 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008293 val = encoding_map_lookup(ch, mapping);
8294 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 break;
8296 ++collendpos;
8297 continue;
8298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008300 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8301 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 if (rep==NULL)
8303 return -1;
8304 else if (rep!=Py_None) {
8305 Py_DECREF(rep);
8306 break;
8307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008308 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310 }
8311 /* cache callback name lookup
8312 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008313 if (*error_handler == _Py_ERROR_UNKNOWN)
8314 *error_handler = get_error_handler(errors);
8315
8316 switch (*error_handler) {
8317 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008318 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008319 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008320
8321 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008322 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 x = charmapencode_output('?', mapping, res, respos);
8324 if (x==enc_EXCEPTION) {
8325 return -1;
8326 }
8327 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008328 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 return -1;
8330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008331 }
8332 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008333 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008334 *inpos = collendpos;
8335 break;
Victor Stinner50149202015-09-22 00:26:54 +02008336
8337 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008338 /* generate replacement (temporarily (mis)uses p) */
8339 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 char buffer[2+29+1+1];
8341 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008342 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 for (cp = buffer; *cp; ++cp) {
8344 x = charmapencode_output(*cp, mapping, res, respos);
8345 if (x==enc_EXCEPTION)
8346 return -1;
8347 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008348 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
8350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008351 }
8352 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008353 *inpos = collendpos;
8354 break;
Victor Stinner50149202015-09-22 00:26:54 +02008355
Benjamin Peterson14339b62009-01-31 16:36:08 +00008356 default:
Victor Stinner50149202015-09-22 00:26:54 +02008357 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008358 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008360 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008362 if (PyBytes_Check(repunicode)) {
8363 /* Directly copy bytes result to output. */
8364 Py_ssize_t outsize = PyBytes_Size(*res);
8365 Py_ssize_t requiredsize;
8366 repsize = PyBytes_Size(repunicode);
8367 requiredsize = *respos + repsize;
8368 if (requiredsize > outsize)
8369 /* Make room for all additional bytes. */
8370 if (charmapencode_resize(res, respos, requiredsize)) {
8371 Py_DECREF(repunicode);
8372 return -1;
8373 }
8374 memcpy(PyBytes_AsString(*res) + *respos,
8375 PyBytes_AsString(repunicode), repsize);
8376 *respos += repsize;
8377 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008378 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008379 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008381 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008382 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008383 Py_DECREF(repunicode);
8384 return -1;
8385 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008386 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008387 data = PyUnicode_DATA(repunicode);
8388 kind = PyUnicode_KIND(repunicode);
8389 for (index = 0; index < repsize; index++) {
8390 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8391 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008393 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return -1;
8395 }
8396 else if (x==enc_FAILED) {
8397 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008398 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 return -1;
8400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401 }
8402 *inpos = newpos;
8403 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
8405 return 0;
8406}
8407
Alexander Belopolsky40018472011-02-26 01:02:56 +00008408PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008409_PyUnicode_EncodeCharmap(PyObject *unicode,
8410 PyObject *mapping,
8411 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 /* output object */
8414 PyObject *res = NULL;
8415 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008416 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008419 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008420 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008422 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008423 void *data;
8424 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425
Benjamin Petersonbac79492012-01-14 13:34:47 -05008426 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008427 return NULL;
8428 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008429 data = PyUnicode_DATA(unicode);
8430 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 /* Default to Latin-1 */
8433 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008434 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 /* allocate enough for a simple encoding without
8437 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008438 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 if (res == NULL)
8440 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008441 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008445 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008447 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 if (x==enc_EXCEPTION) /* error */
8449 goto onError;
8450 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008451 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008453 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 &res, &respos)) {
8455 goto onError;
8456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 else
8459 /* done with this character => adjust input position */
8460 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008464 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008465 if (_PyBytes_Resize(&res, respos) < 0)
8466 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008467
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008469 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 return res;
8471
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 Py_XDECREF(res);
8474 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008475 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 return NULL;
8477}
8478
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008479/* Deprecated */
8480PyObject *
8481PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8482 Py_ssize_t size,
8483 PyObject *mapping,
8484 const char *errors)
8485{
8486 PyObject *result;
8487 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8488 if (unicode == NULL)
8489 return NULL;
8490 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8491 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008492 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008493}
8494
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495PyObject *
8496PyUnicode_AsCharmapString(PyObject *unicode,
8497 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498{
8499 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 PyErr_BadArgument();
8501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504}
8505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008507static void
8508make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008510 Py_ssize_t startpos, Py_ssize_t endpos,
8511 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 *exceptionObject = _PyUnicodeTranslateError_Create(
8515 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 }
8517 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8519 goto onError;
8520 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8521 goto onError;
8522 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8523 goto onError;
8524 return;
8525 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008526 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 }
8528}
8529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530/* error handling callback helper:
8531 build arguments, call the callback and check the arguments,
8532 put the result into newpos and return the replacement string, which
8533 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534static PyObject *
8535unicode_translate_call_errorhandler(const char *errors,
8536 PyObject **errorHandler,
8537 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008539 Py_ssize_t startpos, Py_ssize_t endpos,
8540 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008542 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008544 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 PyObject *restuple;
8546 PyObject *resunicode;
8547
8548 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 }
8553
8554 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558
8559 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008564 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 Py_DECREF(restuple);
8566 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 }
8568 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 &resunicode, &i_newpos)) {
8570 Py_DECREF(restuple);
8571 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008573 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008575 else
8576 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008578 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 Py_DECREF(restuple);
8580 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008581 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 Py_INCREF(resunicode);
8583 Py_DECREF(restuple);
8584 return resunicode;
8585}
8586
8587/* Lookup the character ch in the mapping and put the result in result,
8588 which must be decrefed by the caller.
8589 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008590static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592{
Christian Heimes217cfd12007-12-02 14:31:20 +00008593 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 PyObject *x;
8595
8596 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 x = PyObject_GetItem(mapping, w);
8599 Py_DECREF(w);
8600 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8602 /* No mapping found means: use 1:1 mapping. */
8603 PyErr_Clear();
8604 *result = NULL;
8605 return 0;
8606 } else
8607 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 }
8609 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 *result = x;
8611 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008613 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008615 if (value < 0 || value > MAX_UNICODE) {
8616 PyErr_Format(PyExc_ValueError,
8617 "character mapping must be in range(0x%x)",
8618 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 Py_DECREF(x);
8620 return -1;
8621 }
8622 *result = x;
8623 return 0;
8624 }
8625 else if (PyUnicode_Check(x)) {
8626 *result = x;
8627 return 0;
8628 }
8629 else {
8630 /* wrong return value */
8631 PyErr_SetString(PyExc_TypeError,
8632 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 Py_DECREF(x);
8634 return -1;
8635 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636}
Victor Stinner1194ea02014-04-04 19:37:40 +02008637
8638/* lookup the character, write the result into the writer.
8639 Return 1 if the result was written into the writer, return 0 if the mapping
8640 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008642charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8643 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644{
Victor Stinner1194ea02014-04-04 19:37:40 +02008645 PyObject *item;
8646
8647 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008649
8650 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008652 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008655 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008657
8658 if (item == Py_None) {
8659 Py_DECREF(item);
8660 return 0;
8661 }
8662
8663 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008664 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8665 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8666 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008667 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8668 Py_DECREF(item);
8669 return -1;
8670 }
8671 Py_DECREF(item);
8672 return 1;
8673 }
8674
8675 if (!PyUnicode_Check(item)) {
8676 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008678 }
8679
8680 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8681 Py_DECREF(item);
8682 return -1;
8683 }
8684
8685 Py_DECREF(item);
8686 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687}
8688
Victor Stinner89a76ab2014-04-05 11:44:04 +02008689static int
8690unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8691 Py_UCS1 *translate)
8692{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008693 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008694 int ret = 0;
8695
Victor Stinner89a76ab2014-04-05 11:44:04 +02008696 if (charmaptranslate_lookup(ch, mapping, &item)) {
8697 return -1;
8698 }
8699
8700 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008701 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008702 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008703 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008704 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008705 /* not found => default to 1:1 mapping */
8706 translate[ch] = ch;
8707 return 1;
8708 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008709 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008710 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008711 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8712 used it */
8713 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008714 /* invalid character or character outside ASCII:
8715 skip the fast translate */
8716 goto exit;
8717 }
8718 translate[ch] = (Py_UCS1)replace;
8719 }
8720 else if (PyUnicode_Check(item)) {
8721 Py_UCS4 replace;
8722
8723 if (PyUnicode_READY(item) == -1) {
8724 Py_DECREF(item);
8725 return -1;
8726 }
8727 if (PyUnicode_GET_LENGTH(item) != 1)
8728 goto exit;
8729
8730 replace = PyUnicode_READ_CHAR(item, 0);
8731 if (replace > 127)
8732 goto exit;
8733 translate[ch] = (Py_UCS1)replace;
8734 }
8735 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008736 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008737 goto exit;
8738 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008739 ret = 1;
8740
Benjamin Peterson1365de72014-04-07 20:15:41 -04008741 exit:
8742 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008743 return ret;
8744}
8745
8746/* Fast path for ascii => ascii translation. Return 1 if the whole string
8747 was translated into writer, return 0 if the input string was partially
8748 translated into writer, raise an exception and return -1 on error. */
8749static int
8750unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008751 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008752{
Victor Stinner872b2912014-04-05 14:27:07 +02008753 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008754 Py_ssize_t len;
8755 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008756 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008757
8758 if (PyUnicode_READY(input) == -1)
8759 return -1;
8760 if (!PyUnicode_IS_ASCII(input))
8761 return 0;
8762 len = PyUnicode_GET_LENGTH(input);
8763
Victor Stinner872b2912014-04-05 14:27:07 +02008764 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008765
8766 in = PyUnicode_1BYTE_DATA(input);
8767 end = in + len;
8768
8769 assert(PyUnicode_IS_ASCII(writer->buffer));
8770 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8771 out = PyUnicode_1BYTE_DATA(writer->buffer);
8772
Victor Stinner872b2912014-04-05 14:27:07 +02008773 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008774 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008775 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008776 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008777 int translate = unicode_fast_translate_lookup(mapping, ch,
8778 ascii_table);
8779 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008780 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008781 if (translate == 0)
8782 goto exit;
8783 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008784 }
Victor Stinner872b2912014-04-05 14:27:07 +02008785 if (ch2 == 0xfe) {
8786 if (ignore)
8787 continue;
8788 goto exit;
8789 }
8790 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008791 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008792 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008793 }
Victor Stinner872b2912014-04-05 14:27:07 +02008794 res = 1;
8795
8796exit:
8797 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8798 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008799}
8800
Victor Stinner3222da22015-10-01 22:07:32 +02008801static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802_PyUnicode_TranslateCharmap(PyObject *input,
8803 PyObject *mapping,
8804 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008807 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 Py_ssize_t size, i;
8809 int kind;
8810 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008811 _PyUnicodeWriter writer;
8812 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008813 char *reason = "character maps to <undefined>";
8814 PyObject *errorHandler = NULL;
8815 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008816 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008817 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 PyErr_BadArgument();
8821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 if (PyUnicode_READY(input) == -1)
8825 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008826 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 kind = PyUnicode_KIND(input);
8828 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829
8830 if (size == 0) {
8831 Py_INCREF(input);
8832 return input;
8833 }
8834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008835 /* allocate enough for a simple 1:1 translation without
8836 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008837 _PyUnicodeWriter_Init(&writer);
8838 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840
Victor Stinner872b2912014-04-05 14:27:07 +02008841 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8842
8843 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844 if (res < 0) {
8845 _PyUnicodeWriter_Dealloc(&writer);
8846 return NULL;
8847 }
8848 if (res == 1)
8849 return _PyUnicodeWriter_Finish(&writer);
8850
Victor Stinner89a76ab2014-04-05 11:44:04 +02008851 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008854 int translate;
8855 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8856 Py_ssize_t newpos;
8857 /* startpos for collecting untranslatable chars */
8858 Py_ssize_t collstart;
8859 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008860 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861
Victor Stinner1194ea02014-04-04 19:37:40 +02008862 ch = PyUnicode_READ(kind, data, i);
8863 translate = charmaptranslate_output(ch, mapping, &writer);
8864 if (translate < 0)
8865 goto onError;
8866
8867 if (translate != 0) {
8868 /* it worked => adjust input pointer */
8869 ++i;
8870 continue;
8871 }
8872
8873 /* untranslatable character */
8874 collstart = i;
8875 collend = i+1;
8876
8877 /* find all untranslatable characters */
8878 while (collend < size) {
8879 PyObject *x;
8880 ch = PyUnicode_READ(kind, data, collend);
8881 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008882 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008883 Py_XDECREF(x);
8884 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008886 ++collend;
8887 }
8888
8889 if (ignore) {
8890 i = collend;
8891 }
8892 else {
8893 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8894 reason, input, &exc,
8895 collstart, collend, &newpos);
8896 if (repunicode == NULL)
8897 goto onError;
8898 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008900 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008901 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008902 Py_DECREF(repunicode);
8903 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008904 }
8905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 Py_XDECREF(exc);
8907 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008908 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008911 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008912 Py_XDECREF(exc);
8913 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 return NULL;
8915}
8916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917/* Deprecated. Use PyUnicode_Translate instead. */
8918PyObject *
8919PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8920 Py_ssize_t size,
8921 PyObject *mapping,
8922 const char *errors)
8923{
Christian Heimes5f520f42012-09-11 14:03:25 +02008924 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8926 if (!unicode)
8927 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008928 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8929 Py_DECREF(unicode);
8930 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931}
8932
Alexander Belopolsky40018472011-02-26 01:02:56 +00008933PyObject *
8934PyUnicode_Translate(PyObject *str,
8935 PyObject *mapping,
8936 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937{
8938 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008939
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 str = PyUnicode_FromObject(str);
8941 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008942 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 Py_DECREF(str);
8945 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946}
Tim Petersced69f82003-09-16 20:30:58 +00008947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008949fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950{
8951 /* No need to call PyUnicode_READY(self) because this function is only
8952 called as a callback from fixup() which does it already. */
8953 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8954 const int kind = PyUnicode_KIND(self);
8955 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008956 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008957 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 Py_ssize_t i;
8959
8960 for (i = 0; i < len; ++i) {
8961 ch = PyUnicode_READ(kind, data, i);
8962 fixed = 0;
8963 if (ch > 127) {
8964 if (Py_UNICODE_ISSPACE(ch))
8965 fixed = ' ';
8966 else {
8967 const int decimal = Py_UNICODE_TODECIMAL(ch);
8968 if (decimal >= 0)
8969 fixed = '0' + decimal;
8970 }
8971 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008972 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008973 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 PyUnicode_WRITE(kind, data, i, fixed);
8975 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008976 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008977 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 }
8980
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008981 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982}
8983
8984PyObject *
8985_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8986{
8987 if (!PyUnicode_Check(unicode)) {
8988 PyErr_BadInternalCall();
8989 return NULL;
8990 }
8991 if (PyUnicode_READY(unicode) == -1)
8992 return NULL;
8993 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8994 /* If the string is already ASCII, just return the same string */
8995 Py_INCREF(unicode);
8996 return unicode;
8997 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008998 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999}
9000
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009001PyObject *
9002PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9003 Py_ssize_t length)
9004{
Victor Stinnerf0124502011-11-21 23:12:56 +01009005 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009006 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009007 Py_UCS4 maxchar;
9008 enum PyUnicode_Kind kind;
9009 void *data;
9010
Victor Stinner99d7ad02012-02-22 13:37:39 +01009011 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009012 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009013 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009014 if (ch > 127) {
9015 int decimal = Py_UNICODE_TODECIMAL(ch);
9016 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009017 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009018 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009019 }
9020 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009021
9022 /* Copy to a new string */
9023 decimal = PyUnicode_New(length, maxchar);
9024 if (decimal == NULL)
9025 return decimal;
9026 kind = PyUnicode_KIND(decimal);
9027 data = PyUnicode_DATA(decimal);
9028 /* Iterate over code points */
9029 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009030 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009031 if (ch > 127) {
9032 int decimal = Py_UNICODE_TODECIMAL(ch);
9033 if (decimal >= 0)
9034 ch = '0' + decimal;
9035 }
9036 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009038 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009039}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009040/* --- Decimal Encoder ---------------------------------------------------- */
9041
Alexander Belopolsky40018472011-02-26 01:02:56 +00009042int
9043PyUnicode_EncodeDecimal(Py_UNICODE *s,
9044 Py_ssize_t length,
9045 char *output,
9046 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009047{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009048 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009049 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009050 enum PyUnicode_Kind kind;
9051 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009052
9053 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 PyErr_BadArgument();
9055 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009056 }
9057
Victor Stinner42bf7752011-11-21 22:52:58 +01009058 unicode = PyUnicode_FromUnicode(s, length);
9059 if (unicode == NULL)
9060 return -1;
9061
Benjamin Petersonbac79492012-01-14 13:34:47 -05009062 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009063 Py_DECREF(unicode);
9064 return -1;
9065 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009066 kind = PyUnicode_KIND(unicode);
9067 data = PyUnicode_DATA(unicode);
9068
Victor Stinnerb84d7232011-11-22 01:50:07 +01009069 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009070 PyObject *exc;
9071 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009073 Py_ssize_t startpos;
9074
9075 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009076
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009078 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009079 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 decimal = Py_UNICODE_TODECIMAL(ch);
9083 if (decimal >= 0) {
9084 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009085 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 continue;
9087 }
9088 if (0 < ch && ch < 256) {
9089 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009090 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 continue;
9092 }
Victor Stinner6345be92011-11-25 20:09:01 +01009093
Victor Stinner42bf7752011-11-21 22:52:58 +01009094 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009095 exc = NULL;
9096 raise_encode_exception(&exc, "decimal", unicode,
9097 startpos, startpos+1,
9098 "invalid decimal Unicode string");
9099 Py_XDECREF(exc);
9100 Py_DECREF(unicode);
9101 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009102 }
9103 /* 0-terminate the output string */
9104 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009105 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009106 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009107}
9108
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109/* --- Helpers ------------------------------------------------------------ */
9110
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` has `len` added
   to it and is then clamped at 0.  A `start` greater than `len` is left
   as is.  `start` and `end` must be modifiable lvalues; `len` is evaluated
   several times and therefore must not have side effects.

   The body is wrapped in do { } while (0) so that an invocation followed
   by a semicolon is exactly one statement and can be used as the body of
   an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009127any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 Py_ssize_t start,
9129 Py_ssize_t end)
9130{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009131 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 void *buf1, *buf2;
9133 Py_ssize_t len1, len2, result;
9134
9135 kind1 = PyUnicode_KIND(s1);
9136 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009137 if (kind1 < kind2)
9138 return -1;
9139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 len1 = PyUnicode_GET_LENGTH(s1);
9141 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009142 ADJUST_INDICES(start, end, len1);
9143 if (end - start < len2)
9144 return -1;
9145
9146 buf1 = PyUnicode_DATA(s1);
9147 buf2 = PyUnicode_DATA(s2);
9148 if (len2 == 1) {
9149 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9150 result = findchar((const char *)buf1 + kind1*start,
9151 kind1, end - start, ch, direction);
9152 if (result == -1)
9153 return -1;
9154 else
9155 return start + result;
9156 }
9157
9158 if (kind2 != kind1) {
9159 buf2 = _PyUnicode_AsKind(s2, kind1);
9160 if (!buf2)
9161 return -2;
9162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163
Victor Stinner794d5672011-10-10 03:21:36 +02009164 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009165 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009166 case PyUnicode_1BYTE_KIND:
9167 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9168 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9169 else
9170 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9171 break;
9172 case PyUnicode_2BYTE_KIND:
9173 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9174 break;
9175 case PyUnicode_4BYTE_KIND:
9176 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9177 break;
9178 default:
9179 assert(0); result = -2;
9180 }
9181 }
9182 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009183 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009184 case PyUnicode_1BYTE_KIND:
9185 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9186 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9187 else
9188 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9189 break;
9190 case PyUnicode_2BYTE_KIND:
9191 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9192 break;
9193 case PyUnicode_4BYTE_KIND:
9194 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9195 break;
9196 default:
9197 assert(0); result = -2;
9198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 }
9200
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009201 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 PyMem_Free(buf2);
9203
9204 return result;
9205}
9206
9207Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009208_PyUnicode_InsertThousandsGrouping(
9209 PyObject *unicode, Py_ssize_t index,
9210 Py_ssize_t n_buffer,
9211 void *digits, Py_ssize_t n_digits,
9212 Py_ssize_t min_width,
9213 const char *grouping, PyObject *thousands_sep,
9214 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215{
Victor Stinner41a863c2012-02-24 00:37:51 +01009216 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009217 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009218 Py_ssize_t thousands_sep_len;
9219 Py_ssize_t len;
9220
9221 if (unicode != NULL) {
9222 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009223 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009224 }
9225 else {
9226 kind = PyUnicode_1BYTE_KIND;
9227 data = NULL;
9228 }
9229 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9230 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9231 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9232 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009233 if (thousands_sep_kind < kind) {
9234 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9235 if (!thousands_sep_data)
9236 return -1;
9237 }
9238 else {
9239 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9240 if (!data)
9241 return -1;
9242 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009243 }
9244
Benjamin Petersonead6b532011-12-20 17:23:42 -06009245 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009247 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009248 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009249 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009250 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009251 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009252 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009253 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009254 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009256 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009257 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009259 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009260 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009261 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009262 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009263 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009266 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009268 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009269 break;
9270 default:
9271 assert(0);
9272 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009274 if (unicode != NULL && thousands_sep_kind != kind) {
9275 if (thousands_sep_kind < kind)
9276 PyMem_Free(thousands_sep_data);
9277 else
9278 PyMem_Free(data);
9279 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009280 if (unicode == NULL) {
9281 *maxchar = 127;
9282 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009283 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009284 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009285 }
9286 }
9287 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288}
9289
9290
Alexander Belopolsky40018472011-02-26 01:02:56 +00009291Py_ssize_t
9292PyUnicode_Count(PyObject *str,
9293 PyObject *substr,
9294 Py_ssize_t start,
9295 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009297 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009298 PyObject* str_obj;
9299 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009300 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 void *buf1 = NULL, *buf2 = NULL;
9302 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009303
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009304 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009305 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009307 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009308 if (!sub_obj) {
9309 Py_DECREF(str_obj);
9310 return -1;
9311 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009312 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009313 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009314 Py_DECREF(str_obj);
9315 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316 }
Tim Petersced69f82003-09-16 20:30:58 +00009317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 kind1 = PyUnicode_KIND(str_obj);
9319 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009320 if (kind1 < kind2) {
9321 Py_DECREF(sub_obj);
9322 Py_DECREF(str_obj);
9323 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009324 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 len1 = PyUnicode_GET_LENGTH(str_obj);
9327 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009329 if (end - start < len2) {
9330 Py_DECREF(sub_obj);
9331 Py_DECREF(str_obj);
9332 return 0;
9333 }
9334
9335 buf1 = PyUnicode_DATA(str_obj);
9336 buf2 = PyUnicode_DATA(sub_obj);
9337 if (kind2 != kind1) {
9338 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9339 if (!buf2)
9340 goto onError;
9341 }
9342
9343 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009345 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9346 result = asciilib_count(
9347 ((Py_UCS1*)buf1) + start, end - start,
9348 buf2, len2, PY_SSIZE_T_MAX
9349 );
9350 else
9351 result = ucs1lib_count(
9352 ((Py_UCS1*)buf1) + start, end - start,
9353 buf2, len2, PY_SSIZE_T_MAX
9354 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 break;
9356 case PyUnicode_2BYTE_KIND:
9357 result = ucs2lib_count(
9358 ((Py_UCS2*)buf1) + start, end - start,
9359 buf2, len2, PY_SSIZE_T_MAX
9360 );
9361 break;
9362 case PyUnicode_4BYTE_KIND:
9363 result = ucs4lib_count(
9364 ((Py_UCS4*)buf1) + start, end - start,
9365 buf2, len2, PY_SSIZE_T_MAX
9366 );
9367 break;
9368 default:
9369 assert(0); result = 0;
9370 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009371
9372 Py_DECREF(sub_obj);
9373 Py_DECREF(str_obj);
9374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009375 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 PyMem_Free(buf2);
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 onError:
9380 Py_DECREF(sub_obj);
9381 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009382 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 PyMem_Free(buf2);
9384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385}
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387Py_ssize_t
9388PyUnicode_Find(PyObject *str,
9389 PyObject *sub,
9390 Py_ssize_t start,
9391 Py_ssize_t end,
9392 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009394 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009395
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009397 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009400 if (!sub) {
9401 Py_DECREF(str);
9402 return -2;
9403 }
9404 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9405 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 Py_DECREF(str);
9407 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 }
Tim Petersced69f82003-09-16 20:30:58 +00009409
Victor Stinner794d5672011-10-10 03:21:36 +02009410 result = any_find_slice(direction,
9411 str, sub, start, end
9412 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009413
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009415 Py_DECREF(sub);
9416
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417 return result;
9418}
9419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420Py_ssize_t
9421PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9422 Py_ssize_t start, Py_ssize_t end,
9423 int direction)
9424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009426 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 if (PyUnicode_READY(str) == -1)
9428 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009429 if (start < 0 || end < 0) {
9430 PyErr_SetString(PyExc_IndexError, "string index out of range");
9431 return -2;
9432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 if (end > PyUnicode_GET_LENGTH(str))
9434 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435 if (start >= end)
9436 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009438 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9439 kind, end-start, ch, direction);
9440 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009442 else
9443 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444}
9445
Alexander Belopolsky40018472011-02-26 01:02:56 +00009446static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009447tailmatch(PyObject *self,
9448 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009449 Py_ssize_t start,
9450 Py_ssize_t end,
9451 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 int kind_self;
9454 int kind_sub;
9455 void *data_self;
9456 void *data_sub;
9457 Py_ssize_t offset;
9458 Py_ssize_t i;
9459 Py_ssize_t end_sub;
9460
9461 if (PyUnicode_READY(self) == -1 ||
9462 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009463 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9466 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009470 if (PyUnicode_GET_LENGTH(substring) == 0)
9471 return 1;
9472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 kind_self = PyUnicode_KIND(self);
9474 data_self = PyUnicode_DATA(self);
9475 kind_sub = PyUnicode_KIND(substring);
9476 data_sub = PyUnicode_DATA(substring);
9477 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9478
9479 if (direction > 0)
9480 offset = end;
9481 else
9482 offset = start;
9483
9484 if (PyUnicode_READ(kind_self, data_self, offset) ==
9485 PyUnicode_READ(kind_sub, data_sub, 0) &&
9486 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9487 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9488 /* If both are of the same kind, memcmp is sufficient */
9489 if (kind_self == kind_sub) {
9490 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009491 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 data_sub,
9493 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009494 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 }
9496 /* otherwise we have to compare each character by first accesing it */
9497 else {
9498 /* We do not need to compare 0 and len(substring)-1 because
9499 the if statement above ensured already that they are equal
9500 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 for (i = 1; i < end_sub; ++i) {
9502 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9503 PyUnicode_READ(kind_sub, data_sub, i))
9504 return 0;
9505 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
9509
9510 return 0;
9511}
9512
Alexander Belopolsky40018472011-02-26 01:02:56 +00009513Py_ssize_t
9514PyUnicode_Tailmatch(PyObject *str,
9515 PyObject *substr,
9516 Py_ssize_t start,
9517 Py_ssize_t end,
9518 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009520 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009521
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 str = PyUnicode_FromObject(str);
9523 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 substr = PyUnicode_FromObject(substr);
9526 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 Py_DECREF(str);
9528 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 }
Tim Petersced69f82003-09-16 20:30:58 +00009530
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009531 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 Py_DECREF(str);
9534 Py_DECREF(substr);
9535 return result;
9536}
9537
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538/* Apply fixfct filter to the Unicode object self and return a
9539 reference to the modified object */
9540
Alexander Belopolsky40018472011-02-26 01:02:56 +00009541static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009542fixup(PyObject *self,
9543 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 PyObject *u;
9546 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009547 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009549 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009552 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 /* fix functions return the new maximum character in a string,
9555 if the kind of the resulting unicode object does not change,
9556 everything is fine. Otherwise we need to change the string kind
9557 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009558 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009559
9560 if (maxchar_new == 0) {
9561 /* no changes */;
9562 if (PyUnicode_CheckExact(self)) {
9563 Py_DECREF(u);
9564 Py_INCREF(self);
9565 return self;
9566 }
9567 else
9568 return u;
9569 }
9570
Victor Stinnere6abb482012-05-02 01:15:40 +02009571 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572
Victor Stinnereaab6042011-12-11 22:22:39 +01009573 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009575
9576 /* In case the maximum character changed, we need to
9577 convert the string to the new category. */
9578 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9579 if (v == NULL) {
9580 Py_DECREF(u);
9581 return NULL;
9582 }
9583 if (maxchar_new > maxchar_old) {
9584 /* If the maxchar increased so that the kind changed, not all
9585 characters are representable anymore and we need to fix the
9586 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009587 _PyUnicode_FastCopyCharacters(v, 0,
9588 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009589 maxchar_old = fixfct(v);
9590 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 }
9592 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009593 _PyUnicode_FastCopyCharacters(v, 0,
9594 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009596 Py_DECREF(u);
9597 assert(_PyUnicode_CheckConsistency(v, 1));
9598 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599}
9600
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009601static PyObject *
9602ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009604 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9605 char *resdata, *data = PyUnicode_DATA(self);
9606 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009607
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009608 res = PyUnicode_New(len, 127);
9609 if (res == NULL)
9610 return NULL;
9611 resdata = PyUnicode_DATA(res);
9612 if (lower)
9613 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615 _Py_bytes_upper(resdata, data, len);
9616 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617}
9618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009622 Py_ssize_t j;
9623 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009624 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009625 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009626
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9628
9629 where ! is a negation and \p{xxx} is a character with property xxx.
9630 */
9631 for (j = i - 1; j >= 0; j--) {
9632 c = PyUnicode_READ(kind, data, j);
9633 if (!_PyUnicode_IsCaseIgnorable(c))
9634 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009636 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9637 if (final_sigma) {
9638 for (j = i + 1; j < length; j++) {
9639 c = PyUnicode_READ(kind, data, j);
9640 if (!_PyUnicode_IsCaseIgnorable(c))
9641 break;
9642 }
9643 final_sigma = j == length || !_PyUnicode_IsCased(c);
9644 }
9645 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646}
9647
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648static int
9649lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9650 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 /* Obscure special case. */
9653 if (c == 0x3A3) {
9654 mapped[0] = handle_capital_sigma(kind, data, length, i);
9655 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658}
9659
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660static Py_ssize_t
9661do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 Py_ssize_t i, k = 0;
9664 int n_res, j;
9665 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009666
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 c = PyUnicode_READ(kind, data, 0);
9668 n_res = _PyUnicode_ToUpperFull(c, mapped);
9669 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009670 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 for (i = 1; i < length; i++) {
9674 c = PyUnicode_READ(kind, data, i);
9675 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9676 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009677 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009679 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009680 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682}
9683
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684static Py_ssize_t
9685do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9686 Py_ssize_t i, k = 0;
9687
9688 for (i = 0; i < length; i++) {
9689 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9690 int n_res, j;
9691 if (Py_UNICODE_ISUPPER(c)) {
9692 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9693 }
9694 else if (Py_UNICODE_ISLOWER(c)) {
9695 n_res = _PyUnicode_ToUpperFull(c, mapped);
9696 }
9697 else {
9698 n_res = 1;
9699 mapped[0] = c;
9700 }
9701 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009702 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 res[k++] = mapped[j];
9704 }
9705 }
9706 return k;
9707}
9708
9709static Py_ssize_t
9710do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9711 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 Py_ssize_t i, k = 0;
9714
9715 for (i = 0; i < length; i++) {
9716 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9717 int n_res, j;
9718 if (lower)
9719 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9720 else
9721 n_res = _PyUnicode_ToUpperFull(c, mapped);
9722 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009723 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 res[k++] = mapped[j];
9725 }
9726 }
9727 return k;
9728}
9729
9730static Py_ssize_t
9731do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9732{
9733 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9734}
9735
9736static Py_ssize_t
9737do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9738{
9739 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9740}
9741
Benjamin Petersone51757f2012-01-12 21:10:29 -05009742static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009743do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9744{
9745 Py_ssize_t i, k = 0;
9746
9747 for (i = 0; i < length; i++) {
9748 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9749 Py_UCS4 mapped[3];
9750 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9751 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009752 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009753 res[k++] = mapped[j];
9754 }
9755 }
9756 return k;
9757}
9758
9759static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009760do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9761{
9762 Py_ssize_t i, k = 0;
9763 int previous_is_cased;
9764
9765 previous_is_cased = 0;
9766 for (i = 0; i < length; i++) {
9767 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9768 Py_UCS4 mapped[3];
9769 int n_res, j;
9770
9771 if (previous_is_cased)
9772 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9773 else
9774 n_res = _PyUnicode_ToTitleFull(c, mapped);
9775
9776 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009777 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009778 res[k++] = mapped[j];
9779 }
9780
9781 previous_is_cased = _PyUnicode_IsCased(c);
9782 }
9783 return k;
9784}
9785
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009786static PyObject *
9787case_operation(PyObject *self,
9788 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9789{
9790 PyObject *res = NULL;
9791 Py_ssize_t length, newlength = 0;
9792 int kind, outkind;
9793 void *data, *outdata;
9794 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9795
Benjamin Petersoneea48462012-01-16 14:28:50 -05009796 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797
9798 kind = PyUnicode_KIND(self);
9799 data = PyUnicode_DATA(self);
9800 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009801 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009802 PyErr_SetString(PyExc_OverflowError, "string is too long");
9803 return NULL;
9804 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009805 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009806 if (tmp == NULL)
9807 return PyErr_NoMemory();
9808 newlength = perform(kind, data, length, tmp, &maxchar);
9809 res = PyUnicode_New(newlength, maxchar);
9810 if (res == NULL)
9811 goto leave;
9812 tmpend = tmp + newlength;
9813 outdata = PyUnicode_DATA(res);
9814 outkind = PyUnicode_KIND(res);
9815 switch (outkind) {
9816 case PyUnicode_1BYTE_KIND:
9817 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9818 break;
9819 case PyUnicode_2BYTE_KIND:
9820 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9821 break;
9822 case PyUnicode_4BYTE_KIND:
9823 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9824 break;
9825 default:
9826 assert(0);
9827 break;
9828 }
9829 leave:
9830 PyMem_FREE(tmp);
9831 return res;
9832}
9833
Tim Peters8ce9f162004-08-27 01:49:32 +00009834PyObject *
9835PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009838 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009840 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009841 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9842 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009843 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009845 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009847 int use_memcpy;
9848 unsigned char *res_data = NULL, *sep_data = NULL;
9849 PyObject *last_obj;
9850 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009852 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009853 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009855 }
9856
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009857 /* NOTE: the following code can't call back into Python code,
9858 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009859 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009860
Tim Peters05eba1f2004-08-27 21:32:02 +00009861 seqlen = PySequence_Fast_GET_SIZE(fseq);
9862 /* If empty sequence, return u"". */
9863 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009864 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009865 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009867
Tim Peters05eba1f2004-08-27 21:32:02 +00009868 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009869 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009870 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009871 if (seqlen == 1) {
9872 if (PyUnicode_CheckExact(items[0])) {
9873 res = items[0];
9874 Py_INCREF(res);
9875 Py_DECREF(fseq);
9876 return res;
9877 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009878 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009879 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009880 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009881 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009882 /* Set up sep and seplen */
9883 if (separator == NULL) {
9884 /* fall back to a blank space separator */
9885 sep = PyUnicode_FromOrdinal(' ');
9886 if (!sep)
9887 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009888 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009889 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009891 else {
9892 if (!PyUnicode_Check(separator)) {
9893 PyErr_Format(PyExc_TypeError,
9894 "separator: expected str instance,"
9895 " %.80s found",
9896 Py_TYPE(separator)->tp_name);
9897 goto onError;
9898 }
9899 if (PyUnicode_READY(separator))
9900 goto onError;
9901 sep = separator;
9902 seplen = PyUnicode_GET_LENGTH(separator);
9903 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9904 /* inc refcount to keep this code path symmetric with the
9905 above case of a blank separator */
9906 Py_INCREF(sep);
9907 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009908 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 }
9910
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009911 /* There are at least two things to join, or else we have a subclass
9912 * of str in the sequence.
9913 * Do a pre-pass to figure out the total amount of space we'll
9914 * need (sz), and see whether all argument are strings.
9915 */
9916 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009917#ifdef Py_DEBUG
9918 use_memcpy = 0;
9919#else
9920 use_memcpy = 1;
9921#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009922 for (i = 0; i < seqlen; i++) {
9923 const Py_ssize_t old_sz = sz;
9924 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 if (!PyUnicode_Check(item)) {
9926 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009927 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 " %.80s found",
9929 i, Py_TYPE(item)->tp_name);
9930 goto onError;
9931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 if (PyUnicode_READY(item) == -1)
9933 goto onError;
9934 sz += PyUnicode_GET_LENGTH(item);
9935 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009936 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009937 if (i != 0)
9938 sz += seplen;
9939 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9940 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009941 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 goto onError;
9943 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 if (use_memcpy && last_obj != NULL) {
9945 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9946 use_memcpy = 0;
9947 }
9948 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009949 }
Tim Petersced69f82003-09-16 20:30:58 +00009950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009952 if (res == NULL)
9953 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009954
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009956#ifdef Py_DEBUG
9957 use_memcpy = 0;
9958#else
9959 if (use_memcpy) {
9960 res_data = PyUnicode_1BYTE_DATA(res);
9961 kind = PyUnicode_KIND(res);
9962 if (seplen != 0)
9963 sep_data = PyUnicode_1BYTE_DATA(sep);
9964 }
9965#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009966 if (use_memcpy) {
9967 for (i = 0; i < seqlen; ++i) {
9968 Py_ssize_t itemlen;
9969 item = items[i];
9970
9971 /* Copy item, and maybe the separator. */
9972 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009973 Py_MEMCPY(res_data,
9974 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009975 kind * seplen);
9976 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009978
9979 itemlen = PyUnicode_GET_LENGTH(item);
9980 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009981 Py_MEMCPY(res_data,
9982 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009983 kind * itemlen);
9984 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009985 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009986 }
9987 assert(res_data == PyUnicode_1BYTE_DATA(res)
9988 + kind * PyUnicode_GET_LENGTH(res));
9989 }
9990 else {
9991 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9992 Py_ssize_t itemlen;
9993 item = items[i];
9994
9995 /* Copy item, and maybe the separator. */
9996 if (i && seplen != 0) {
9997 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9998 res_offset += seplen;
9999 }
10000
10001 itemlen = PyUnicode_GET_LENGTH(item);
10002 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010003 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 res_offset += itemlen;
10005 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010006 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010008 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010009
Tim Peters05eba1f2004-08-27 21:32:02 +000010010 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010012 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010016 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010018 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 return NULL;
10020}
10021
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the element width
   (1, 2 or 4 byte storage); PyUnicode_WCHAR_KIND is not allowed (asserted).
   For 1- and 2-byte storage the value is truncated to the element type.

   `value` and `length` are evaluated exactly once, and every argument is
   parenthesized, so compound expressions such as `c ? a : b` are safe to
   pass. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_val_ = (value); \
        const Py_ssize_t fill_len_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_val_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = (Py_UCS2)fill_val_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_val_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10045
Victor Stinnerd3f08822012-05-29 12:57:52 +020010046void
10047_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10048 Py_UCS4 fill_char)
10049{
10050 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10051 const void *data = PyUnicode_DATA(unicode);
10052 assert(PyUnicode_IS_READY(unicode));
10053 assert(unicode_modifiable(unicode));
10054 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10055 assert(start >= 0);
10056 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10057 FILL(kind, data, fill_char, start, length);
10058}
10059
Victor Stinner3fe55312012-01-04 00:33:50 +010010060Py_ssize_t
10061PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10062 Py_UCS4 fill_char)
10063{
10064 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010065
10066 if (!PyUnicode_Check(unicode)) {
10067 PyErr_BadInternalCall();
10068 return -1;
10069 }
10070 if (PyUnicode_READY(unicode) == -1)
10071 return -1;
10072 if (unicode_check_modifiable(unicode))
10073 return -1;
10074
Victor Stinnerd3f08822012-05-29 12:57:52 +020010075 if (start < 0) {
10076 PyErr_SetString(PyExc_IndexError, "string index out of range");
10077 return -1;
10078 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010079 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10080 PyErr_SetString(PyExc_ValueError,
10081 "fill character is bigger than "
10082 "the string maximum character");
10083 return -1;
10084 }
10085
10086 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10087 length = Py_MIN(maxlen, length);
10088 if (length <= 0)
10089 return 0;
10090
Victor Stinnerd3f08822012-05-29 12:57:52 +020010091 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010092 return length;
10093}
10094
Victor Stinner9310abb2011-10-05 00:59:23 +020010095static PyObject *
10096pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010097 Py_ssize_t left,
10098 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 PyObject *u;
10102 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010103 int kind;
10104 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105
10106 if (left < 0)
10107 left = 0;
10108 if (right < 0)
10109 right = 0;
10110
Victor Stinnerc4b49542011-12-11 22:44:26 +010010111 if (left == 0 && right == 0)
10112 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10115 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010116 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10117 return NULL;
10118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010120 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010122 if (!u)
10123 return NULL;
10124
10125 kind = PyUnicode_KIND(u);
10126 data = PyUnicode_DATA(u);
10127 if (left)
10128 FILL(kind, data, fill, 0, left);
10129 if (right)
10130 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010131 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010132 assert(_PyUnicode_CheckConsistency(u, 1));
10133 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134}
10135
Alexander Belopolsky40018472011-02-26 01:02:56 +000010136PyObject *
10137PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
10141 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010142 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010144 if (PyUnicode_READY(string) == -1) {
10145 Py_DECREF(string);
10146 return NULL;
10147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
Benjamin Petersonead6b532011-12-20 17:23:42 -060010149 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010151 if (PyUnicode_IS_ASCII(string))
10152 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010153 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010154 PyUnicode_GET_LENGTH(string), keepends);
10155 else
10156 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010157 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010158 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 break;
10160 case PyUnicode_2BYTE_KIND:
10161 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010162 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 PyUnicode_GET_LENGTH(string), keepends);
10164 break;
10165 case PyUnicode_4BYTE_KIND:
10166 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 PyUnicode_GET_LENGTH(string), keepends);
10169 break;
10170 default:
10171 assert(0);
10172 list = 0;
10173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 Py_DECREF(string);
10175 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176}
10177
Alexander Belopolsky40018472011-02-26 01:02:56 +000010178static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010179split(PyObject *self,
10180 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010181 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010183 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 void *buf1, *buf2;
10185 Py_ssize_t len1, len2;
10186 PyObject* out;
10187
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010189 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 if (PyUnicode_READY(self) == -1)
10192 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010195 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 if (PyUnicode_IS_ASCII(self))
10198 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010200 PyUnicode_GET_LENGTH(self), maxcount
10201 );
10202 else
10203 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010204 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010205 PyUnicode_GET_LENGTH(self), maxcount
10206 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 case PyUnicode_2BYTE_KIND:
10208 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 PyUnicode_GET_LENGTH(self), maxcount
10211 );
10212 case PyUnicode_4BYTE_KIND:
10213 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010214 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 PyUnicode_GET_LENGTH(self), maxcount
10216 );
10217 default:
10218 assert(0);
10219 return NULL;
10220 }
10221
10222 if (PyUnicode_READY(substring) == -1)
10223 return NULL;
10224
10225 kind1 = PyUnicode_KIND(self);
10226 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 len1 = PyUnicode_GET_LENGTH(self);
10228 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010229 if (kind1 < kind2 || len1 < len2) {
10230 out = PyList_New(1);
10231 if (out == NULL)
10232 return NULL;
10233 Py_INCREF(self);
10234 PyList_SET_ITEM(out, 0, self);
10235 return out;
10236 }
10237 buf1 = PyUnicode_DATA(self);
10238 buf2 = PyUnicode_DATA(substring);
10239 if (kind2 != kind1) {
10240 buf2 = _PyUnicode_AsKind(substring, kind1);
10241 if (!buf2)
10242 return NULL;
10243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010245 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010247 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10248 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010250 else
10251 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 break;
10254 case PyUnicode_2BYTE_KIND:
10255 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010256 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 break;
10258 case PyUnicode_4BYTE_KIND:
10259 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010260 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 break;
10262 default:
10263 out = NULL;
10264 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010265 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 PyMem_Free(buf2);
10267 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268}
10269
Alexander Belopolsky40018472011-02-26 01:02:56 +000010270static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010271rsplit(PyObject *self,
10272 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010273 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010274{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010275 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 void *buf1, *buf2;
10277 Py_ssize_t len1, len2;
10278 PyObject* out;
10279
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010280 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010281 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 if (PyUnicode_READY(self) == -1)
10284 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010287 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010289 if (PyUnicode_IS_ASCII(self))
10290 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010292 PyUnicode_GET_LENGTH(self), maxcount
10293 );
10294 else
10295 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010296 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010297 PyUnicode_GET_LENGTH(self), maxcount
10298 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 case PyUnicode_2BYTE_KIND:
10300 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 PyUnicode_GET_LENGTH(self), maxcount
10303 );
10304 case PyUnicode_4BYTE_KIND:
10305 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010306 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 PyUnicode_GET_LENGTH(self), maxcount
10308 );
10309 default:
10310 assert(0);
10311 return NULL;
10312 }
10313
10314 if (PyUnicode_READY(substring) == -1)
10315 return NULL;
10316
10317 kind1 = PyUnicode_KIND(self);
10318 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 len1 = PyUnicode_GET_LENGTH(self);
10320 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010321 if (kind1 < kind2 || len1 < len2) {
10322 out = PyList_New(1);
10323 if (out == NULL)
10324 return NULL;
10325 Py_INCREF(self);
10326 PyList_SET_ITEM(out, 0, self);
10327 return out;
10328 }
10329 buf1 = PyUnicode_DATA(self);
10330 buf2 = PyUnicode_DATA(substring);
10331 if (kind2 != kind1) {
10332 buf2 = _PyUnicode_AsKind(substring, kind1);
10333 if (!buf2)
10334 return NULL;
10335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010337 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010339 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10340 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342 else
10343 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 break;
10346 case PyUnicode_2BYTE_KIND:
10347 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 break;
10350 case PyUnicode_4BYTE_KIND:
10351 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 break;
10354 default:
10355 out = NULL;
10356 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010357 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 PyMem_Free(buf2);
10359 return out;
10360}
10361
10362static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010363anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10364 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010366 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010368 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10369 return asciilib_find(buf1, len1, buf2, len2, offset);
10370 else
10371 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 case PyUnicode_2BYTE_KIND:
10373 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10374 case PyUnicode_4BYTE_KIND:
10375 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10376 }
10377 assert(0);
10378 return -1;
10379}
10380
10381static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10383 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010385 switch (kind) {
10386 case PyUnicode_1BYTE_KIND:
10387 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10388 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10389 else
10390 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10391 case PyUnicode_2BYTE_KIND:
10392 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10393 case PyUnicode_4BYTE_KIND:
10394 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10395 }
10396 assert(0);
10397 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010398}
10399
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010400static void
10401replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10402 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10403{
10404 int kind = PyUnicode_KIND(u);
10405 void *data = PyUnicode_DATA(u);
10406 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10407 if (kind == PyUnicode_1BYTE_KIND) {
10408 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10409 (Py_UCS1 *)data + len,
10410 u1, u2, maxcount);
10411 }
10412 else if (kind == PyUnicode_2BYTE_KIND) {
10413 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10414 (Py_UCS2 *)data + len,
10415 u1, u2, maxcount);
10416 }
10417 else {
10418 assert(kind == PyUnicode_4BYTE_KIND);
10419 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10420 (Py_UCS4 *)data + len,
10421 u1, u2, maxcount);
10422 }
10423}
10424
Alexander Belopolsky40018472011-02-26 01:02:56 +000010425static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426replace(PyObject *self, PyObject *str1,
10427 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 PyObject *u;
10430 char *sbuf = PyUnicode_DATA(self);
10431 char *buf1 = PyUnicode_DATA(str1);
10432 char *buf2 = PyUnicode_DATA(str2);
10433 int srelease = 0, release1 = 0, release2 = 0;
10434 int skind = PyUnicode_KIND(self);
10435 int kind1 = PyUnicode_KIND(str1);
10436 int kind2 = PyUnicode_KIND(str2);
10437 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10438 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10439 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010440 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010441 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442
10443 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010444 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010446 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447
Victor Stinner59de0ee2011-10-07 10:01:28 +020010448 if (str1 == str2)
10449 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450
Victor Stinner49a0a212011-10-12 23:46:10 +020010451 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010452 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10453 if (maxchar < maxchar_str1)
10454 /* substring too wide to be present */
10455 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010456 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10457 /* Replacing str1 with str2 may cause a maxchar reduction in the
10458 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010459 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010460 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010465 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010468 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010469 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010470
Victor Stinner69ed0f42013-04-09 21:48:24 +020010471 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010472 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010473 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010474 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010475 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010479
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010480 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10481 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010482 }
10483 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 int rkind = skind;
10485 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010486 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 if (kind1 < rkind) {
10489 /* widen substring */
10490 buf1 = _PyUnicode_AsKind(str1, rkind);
10491 if (!buf1) goto error;
10492 release1 = 1;
10493 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010494 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010495 if (i < 0)
10496 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (rkind > kind2) {
10498 /* widen replacement */
10499 buf2 = _PyUnicode_AsKind(str2, rkind);
10500 if (!buf2) goto error;
10501 release2 = 1;
10502 }
10503 else if (rkind < kind2) {
10504 /* widen self and buf1 */
10505 rkind = kind2;
10506 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010507 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 sbuf = _PyUnicode_AsKind(self, rkind);
10509 if (!sbuf) goto error;
10510 srelease = 1;
10511 buf1 = _PyUnicode_AsKind(str1, rkind);
10512 if (!buf1) goto error;
10513 release1 = 1;
10514 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010515 u = PyUnicode_New(slen, maxchar);
10516 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010518 assert(PyUnicode_KIND(u) == rkind);
10519 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010520
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010522 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010527
10528 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010529 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010530 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010531 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010532 if (i == -1)
10533 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010534 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010536 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 }
10541 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010543 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 int rkind = skind;
10545 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010548 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 buf1 = _PyUnicode_AsKind(str1, rkind);
10550 if (!buf1) goto error;
10551 release1 = 1;
10552 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010553 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010554 if (n == 0)
10555 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010557 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 buf2 = _PyUnicode_AsKind(str2, rkind);
10559 if (!buf2) goto error;
10560 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 rkind = kind2;
10565 sbuf = _PyUnicode_AsKind(self, rkind);
10566 if (!sbuf) goto error;
10567 srelease = 1;
10568 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010569 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 buf1 = _PyUnicode_AsKind(str1, rkind);
10571 if (!buf1) goto error;
10572 release1 = 1;
10573 }
10574 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10575 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010576 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 PyErr_SetString(PyExc_OverflowError,
10578 "replace string is too long");
10579 goto error;
10580 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010581 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010583 _Py_INCREF_UNICODE_EMPTY();
10584 if (!unicode_empty)
10585 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 u = unicode_empty;
10587 goto done;
10588 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010589 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 PyErr_SetString(PyExc_OverflowError,
10591 "replace string is too long");
10592 goto error;
10593 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 u = PyUnicode_New(new_size, maxchar);
10595 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 assert(PyUnicode_KIND(u) == rkind);
10598 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 ires = i = 0;
10600 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 while (n-- > 0) {
10602 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010604 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010605 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010606 if (j == -1)
10607 break;
10608 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010610 memcpy(res + rkind * ires,
10611 sbuf + rkind * i,
10612 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 }
10615 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 memcpy(res + rkind * ires,
10627 sbuf + rkind * i,
10628 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010629 }
10630 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 /* interleave */
10632 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 if (--n <= 0)
10638 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 memcpy(res + rkind * ires,
10640 sbuf + rkind * i,
10641 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 ires++;
10643 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 memcpy(res + rkind * ires,
10646 sbuf + rkind * i,
10647 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 }
10650
10651 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010652 unicode_adjust_maxchar(&u);
10653 if (u == NULL)
10654 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010656
10657 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (srelease)
10659 PyMem_FREE(sbuf);
10660 if (release1)
10661 PyMem_FREE(buf1);
10662 if (release2)
10663 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010664 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 if (srelease)
10670 PyMem_FREE(sbuf);
10671 if (release1)
10672 PyMem_FREE(buf1);
10673 if (release2)
10674 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010675 return unicode_result_unchanged(self);
10676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 error:
10678 if (srelease && sbuf)
10679 PyMem_FREE(sbuf);
10680 if (release1 && buf1)
10681 PyMem_FREE(buf1);
10682 if (release2 && buf2)
10683 PyMem_FREE(buf2);
10684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685}
10686
10687/* --- Unicode Object Methods --------------------------------------------- */
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
10692Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010693characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010696unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010698 if (PyUnicode_READY(self) == -1)
10699 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010700 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701}
10702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010703PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705\n\
10706Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010707have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
10709static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010710unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010712 if (PyUnicode_READY(self) == -1)
10713 return NULL;
10714 if (PyUnicode_GET_LENGTH(self) == 0)
10715 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010716 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717}
10718
Benjamin Petersond5890c82012-01-14 13:23:30 -050010719PyDoc_STRVAR(casefold__doc__,
10720 "S.casefold() -> str\n\
10721\n\
10722Return a version of S suitable for caseless comparisons.");
10723
10724static PyObject *
10725unicode_casefold(PyObject *self)
10726{
10727 if (PyUnicode_READY(self) == -1)
10728 return NULL;
10729 if (PyUnicode_IS_ASCII(self))
10730 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010731 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010732}
10733
10734
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010735/* Argument converter. Coerces to a single unicode character */
10736
10737static int
10738convert_uc(PyObject *obj, void *addr)
10739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010741 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010742
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 uniobj = PyUnicode_FromObject(obj);
10744 if (uniobj == NULL) {
10745 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 return 0;
10748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 Py_DECREF(uniobj);
10753 return 0;
10754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010756 Py_DECREF(uniobj);
10757 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010758}
10759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010760PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010763Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010764done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
10766static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010767unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010769 Py_ssize_t marg, left;
10770 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 Py_UCS4 fillchar = ' ';
10772
Victor Stinnere9a29352011-10-01 02:14:59 +020010773 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775
Benjamin Petersonbac79492012-01-14 13:34:47 -050010776 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 return NULL;
10778
Victor Stinnerc4b49542011-12-11 22:44:26 +010010779 if (PyUnicode_GET_LENGTH(self) >= width)
10780 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Victor Stinnerc4b49542011-12-11 22:44:26 +010010782 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783 left = marg / 2 + (marg & width & 1);
10784
Victor Stinner9310abb2011-10-05 00:59:23 +020010785 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786}
10787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788/* This function assumes that str1 and str2 are readied by the caller. */
10789
Marc-André Lemburge5034372000-08-08 08:04:29 +000010790static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010791unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010792{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010793#define COMPARE(TYPE1, TYPE2) \
10794 do { \
10795 TYPE1* p1 = (TYPE1 *)data1; \
10796 TYPE2* p2 = (TYPE2 *)data2; \
10797 TYPE1* end = p1 + len; \
10798 Py_UCS4 c1, c2; \
10799 for (; p1 != end; p1++, p2++) { \
10800 c1 = *p1; \
10801 c2 = *p2; \
10802 if (c1 != c2) \
10803 return (c1 < c2) ? -1 : 1; \
10804 } \
10805 } \
10806 while (0)
10807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 int kind1, kind2;
10809 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010810 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 kind1 = PyUnicode_KIND(str1);
10813 kind2 = PyUnicode_KIND(str2);
10814 data1 = PyUnicode_DATA(str1);
10815 data2 = PyUnicode_DATA(str2);
10816 len1 = PyUnicode_GET_LENGTH(str1);
10817 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010818 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010819
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010820 switch(kind1) {
10821 case PyUnicode_1BYTE_KIND:
10822 {
10823 switch(kind2) {
10824 case PyUnicode_1BYTE_KIND:
10825 {
10826 int cmp = memcmp(data1, data2, len);
10827 /* normalize result of memcmp() into the range [-1; 1] */
10828 if (cmp < 0)
10829 return -1;
10830 if (cmp > 0)
10831 return 1;
10832 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010833 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010834 case PyUnicode_2BYTE_KIND:
10835 COMPARE(Py_UCS1, Py_UCS2);
10836 break;
10837 case PyUnicode_4BYTE_KIND:
10838 COMPARE(Py_UCS1, Py_UCS4);
10839 break;
10840 default:
10841 assert(0);
10842 }
10843 break;
10844 }
10845 case PyUnicode_2BYTE_KIND:
10846 {
10847 switch(kind2) {
10848 case PyUnicode_1BYTE_KIND:
10849 COMPARE(Py_UCS2, Py_UCS1);
10850 break;
10851 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010852 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010853 COMPARE(Py_UCS2, Py_UCS2);
10854 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010855 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 case PyUnicode_4BYTE_KIND:
10857 COMPARE(Py_UCS2, Py_UCS4);
10858 break;
10859 default:
10860 assert(0);
10861 }
10862 break;
10863 }
10864 case PyUnicode_4BYTE_KIND:
10865 {
10866 switch(kind2) {
10867 case PyUnicode_1BYTE_KIND:
10868 COMPARE(Py_UCS4, Py_UCS1);
10869 break;
10870 case PyUnicode_2BYTE_KIND:
10871 COMPARE(Py_UCS4, Py_UCS2);
10872 break;
10873 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010874 {
10875#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10876 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10877 /* normalize result of wmemcmp() into the range [-1; 1] */
10878 if (cmp < 0)
10879 return -1;
10880 if (cmp > 0)
10881 return 1;
10882#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010884#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010886 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 default:
10888 assert(0);
10889 }
10890 break;
10891 }
10892 default:
10893 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010894 }
10895
Victor Stinner770e19e2012-10-04 22:59:45 +020010896 if (len1 == len2)
10897 return 0;
10898 if (len1 < len2)
10899 return -1;
10900 else
10901 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902
10903#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010904}
10905
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010906Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010907unicode_compare_eq(PyObject *str1, PyObject *str2)
10908{
10909 int kind;
10910 void *data1, *data2;
10911 Py_ssize_t len;
10912 int cmp;
10913
Victor Stinnere5567ad2012-10-23 02:48:49 +020010914 len = PyUnicode_GET_LENGTH(str1);
10915 if (PyUnicode_GET_LENGTH(str2) != len)
10916 return 0;
10917 kind = PyUnicode_KIND(str1);
10918 if (PyUnicode_KIND(str2) != kind)
10919 return 0;
10920 data1 = PyUnicode_DATA(str1);
10921 data2 = PyUnicode_DATA(str2);
10922
10923 cmp = memcmp(data1, data2, len * kind);
10924 return (cmp == 0);
10925}
10926
10927
Alexander Belopolsky40018472011-02-26 01:02:56 +000010928int
10929PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10932 if (PyUnicode_READY(left) == -1 ||
10933 PyUnicode_READY(right) == -1)
10934 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010935
10936 /* a string is equal to itself */
10937 if (left == right)
10938 return 0;
10939
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010940 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010942 PyErr_Format(PyExc_TypeError,
10943 "Can't compare %.100s and %.100s",
10944 left->ob_type->tp_name,
10945 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946 return -1;
10947}
10948
Martin v. Löwis5b222132007-06-10 09:51:05 +000010949int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010950_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10951{
10952 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10953 if (right_str == NULL)
10954 return -1;
10955 return PyUnicode_Compare(left, right_str);
10956}
10957
10958int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010959PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 Py_ssize_t i;
10962 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 Py_UCS4 chr;
10964
Victor Stinner910337b2011-10-03 03:20:16 +020010965 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (PyUnicode_READY(uni) == -1)
10967 return -1;
10968 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010969 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010970 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010971 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010972 size_t len, len2 = strlen(str);
10973 int cmp;
10974
10975 len = Py_MIN(len1, len2);
10976 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010977 if (cmp != 0) {
10978 if (cmp < 0)
10979 return -1;
10980 else
10981 return 1;
10982 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010983 if (len1 > len2)
10984 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010985 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010986 return -1; /* str is longer */
10987 return 0;
10988 }
10989 else {
10990 void *data = PyUnicode_DATA(uni);
10991 /* Compare Unicode string and source character set string */
10992 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010993 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010994 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10995 /* This check keeps Python strings that end in '\0' from comparing equal
10996 to C strings identical up to that point. */
10997 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10998 return 1; /* uni is longer */
10999 if (str[i])
11000 return -1; /* str is longer */
11001 return 0;
11002 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011003}
11004
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011005
Benjamin Peterson29060642009-01-31 22:14:21 +000011006#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011007 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011008
Alexander Belopolsky40018472011-02-26 01:02:56 +000011009PyObject *
11010PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011011{
11012 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011013 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011014
Victor Stinnere5567ad2012-10-23 02:48:49 +020011015 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11016 Py_RETURN_NOTIMPLEMENTED;
11017
11018 if (PyUnicode_READY(left) == -1 ||
11019 PyUnicode_READY(right) == -1)
11020 return NULL;
11021
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011022 if (left == right) {
11023 switch (op) {
11024 case Py_EQ:
11025 case Py_LE:
11026 case Py_GE:
11027 /* a string is equal to itself */
11028 v = Py_True;
11029 break;
11030 case Py_NE:
11031 case Py_LT:
11032 case Py_GT:
11033 v = Py_False;
11034 break;
11035 default:
11036 PyErr_BadArgument();
11037 return NULL;
11038 }
11039 }
11040 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011041 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011042 result ^= (op == Py_NE);
11043 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011044 }
11045 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011046 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011047
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011048 /* Convert the return value to a Boolean */
11049 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011050 case Py_LE:
11051 v = TEST_COND(result <= 0);
11052 break;
11053 case Py_GE:
11054 v = TEST_COND(result >= 0);
11055 break;
11056 case Py_LT:
11057 v = TEST_COND(result == -1);
11058 break;
11059 case Py_GT:
11060 v = TEST_COND(result == 1);
11061 break;
11062 default:
11063 PyErr_BadArgument();
11064 return NULL;
11065 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011066 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011067 Py_INCREF(v);
11068 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011069}
11070
Alexander Belopolsky40018472011-02-26 01:02:56 +000011071int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011072_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11073{
11074 return unicode_eq(aa, bb);
11075}
11076
11077int
Alexander Belopolsky40018472011-02-26 01:02:56 +000011078PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011079{
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020011081 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 void *buf1, *buf2;
11083 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011084 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011085
11086 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000011087 sub = PyUnicode_FromObject(element);
11088 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011089 PyErr_Format(PyExc_TypeError,
11090 "'in <string>' requires string as left operand, not %s",
11091 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011093 }
11094
Thomas Wouters477c8d52006-05-27 19:21:47 +000011095 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011096 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097 Py_DECREF(sub);
11098 return -1;
11099 }
11100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 kind1 = PyUnicode_KIND(str);
11102 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011103 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050011105 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011106 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 }
11108 len1 = PyUnicode_GET_LENGTH(str);
11109 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011110 if (len1 < len2) {
11111 Py_DECREF(sub);
11112 Py_DECREF(str);
11113 return 0;
11114 }
11115 buf1 = PyUnicode_DATA(str);
11116 buf2 = PyUnicode_DATA(sub);
11117 if (len2 == 1) {
11118 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11119 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11120 Py_DECREF(sub);
11121 Py_DECREF(str);
11122 return result;
11123 }
11124 if (kind2 != kind1) {
11125 buf2 = _PyUnicode_AsKind(sub, kind1);
11126 if (!buf2) {
11127 Py_DECREF(sub);
11128 Py_DECREF(str);
11129 return -1;
11130 }
11131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132
Victor Stinner77282cb2013-04-14 19:22:47 +020011133 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 case PyUnicode_1BYTE_KIND:
11135 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11136 break;
11137 case PyUnicode_2BYTE_KIND:
11138 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11139 break;
11140 case PyUnicode_4BYTE_KIND:
11141 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11142 break;
11143 default:
11144 result = -1;
11145 assert(0);
11146 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147
11148 Py_DECREF(str);
11149 Py_DECREF(sub);
11150
Victor Stinner77282cb2013-04-14 19:22:47 +020011151 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 PyMem_Free(buf2);
11153
Guido van Rossum403d68b2000-03-13 15:55:09 +000011154 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011155}
11156
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157/* Concat to string or Unicode object giving a new Unicode object. */
11158
Alexander Belopolsky40018472011-02-26 01:02:56 +000011159PyObject *
11160PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011163 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011164 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165
11166 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011169 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
11174 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011175 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011179 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 }
11183
Victor Stinner488fa492011-12-12 00:01:39 +010011184 u_len = PyUnicode_GET_LENGTH(u);
11185 v_len = PyUnicode_GET_LENGTH(v);
11186 if (u_len > PY_SSIZE_T_MAX - v_len) {
11187 PyErr_SetString(PyExc_OverflowError,
11188 "strings are too large to concat");
11189 goto onError;
11190 }
11191 new_len = u_len + v_len;
11192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011194 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011195 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011198 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011201 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11202 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 Py_DECREF(u);
11204 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011205 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 Py_XDECREF(u);
11210 Py_XDECREF(v);
11211 return NULL;
11212}
11213
Walter Dörwald1ab83302007-05-18 17:15:44 +000011214void
Victor Stinner23e56682011-10-03 03:54:37 +020011215PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011216{
Victor Stinner23e56682011-10-03 03:54:37 +020011217 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011218 Py_UCS4 maxchar, maxchar2;
11219 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011220
11221 if (p_left == NULL) {
11222 if (!PyErr_Occurred())
11223 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011224 return;
11225 }
Victor Stinner23e56682011-10-03 03:54:37 +020011226 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011227 if (right == NULL || left == NULL
11228 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011229 if (!PyErr_Occurred())
11230 PyErr_BadInternalCall();
11231 goto error;
11232 }
11233
Benjamin Petersonbac79492012-01-14 13:34:47 -050011234 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011235 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011236 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011237 goto error;
11238
Victor Stinner488fa492011-12-12 00:01:39 +010011239 /* Shortcuts */
11240 if (left == unicode_empty) {
11241 Py_DECREF(left);
11242 Py_INCREF(right);
11243 *p_left = right;
11244 return;
11245 }
11246 if (right == unicode_empty)
11247 return;
11248
11249 left_len = PyUnicode_GET_LENGTH(left);
11250 right_len = PyUnicode_GET_LENGTH(right);
11251 if (left_len > PY_SSIZE_T_MAX - right_len) {
11252 PyErr_SetString(PyExc_OverflowError,
11253 "strings are too large to concat");
11254 goto error;
11255 }
11256 new_len = left_len + right_len;
11257
11258 if (unicode_modifiable(left)
11259 && PyUnicode_CheckExact(right)
11260 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011261 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11262 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011263 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011264 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011265 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11266 {
11267 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011268 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011269 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011270
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011271 /* copy 'right' into the newly allocated area of 'left' */
11272 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011273 }
Victor Stinner488fa492011-12-12 00:01:39 +010011274 else {
11275 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11276 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011277 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011278
Victor Stinner488fa492011-12-12 00:01:39 +010011279 /* Concat the two Unicode strings */
11280 res = PyUnicode_New(new_len, maxchar);
11281 if (res == NULL)
11282 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011283 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11284 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011285 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011286 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011287 }
11288 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011289 return;
11290
11291error:
Victor Stinner488fa492011-12-12 00:01:39 +010011292 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011293}
11294
11295void
11296PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011298 PyUnicode_Append(pleft, right);
11299 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011300}
11301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011302PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011305Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011306string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011307interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308
11309static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011310unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011312 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011313 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011314 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011316 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 void *buf1, *buf2;
11318 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Jesus Ceaac451502011-04-20 17:09:23 +020011320 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11321 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 kind1 = PyUnicode_KIND(self);
11325 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011326 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011327 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011328 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 len1 = PyUnicode_GET_LENGTH(self);
11331 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011333 if (end - start < len2) {
11334 Py_DECREF(substring);
11335 return PyLong_FromLong(0);
11336 }
11337 buf1 = PyUnicode_DATA(self);
11338 buf2 = PyUnicode_DATA(substring);
11339 if (kind2 != kind1) {
11340 buf2 = _PyUnicode_AsKind(substring, kind1);
11341 if (!buf2) {
11342 Py_DECREF(substring);
11343 return NULL;
11344 }
11345 }
11346 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 case PyUnicode_1BYTE_KIND:
11348 iresult = ucs1lib_count(
11349 ((Py_UCS1*)buf1) + start, end - start,
11350 buf2, len2, PY_SSIZE_T_MAX
11351 );
11352 break;
11353 case PyUnicode_2BYTE_KIND:
11354 iresult = ucs2lib_count(
11355 ((Py_UCS2*)buf1) + start, end - start,
11356 buf2, len2, PY_SSIZE_T_MAX
11357 );
11358 break;
11359 case PyUnicode_4BYTE_KIND:
11360 iresult = ucs4lib_count(
11361 ((Py_UCS4*)buf1) + start, end - start,
11362 buf2, len2, PY_SSIZE_T_MAX
11363 );
11364 break;
11365 default:
11366 assert(0); iresult = 0;
11367 }
11368
11369 result = PyLong_FromSsize_t(iresult);
11370
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011371 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373
11374 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011375
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 return result;
11377}
11378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011379PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011380 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011382Encode S using the codec registered for encoding. Default encoding\n\
11383is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011384handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011385a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11386'xmlcharrefreplace' as well as any other name registered with\n\
11387codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388
11389static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011390unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011392 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 char *encoding = NULL;
11394 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011395
Benjamin Peterson308d6372009-09-18 21:42:35 +000011396 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11397 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011399 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011400}
11401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011403 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404\n\
11405Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011409unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011411 Py_ssize_t i, j, line_pos, src_len, incr;
11412 Py_UCS4 ch;
11413 PyObject *u;
11414 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011415 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011417 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011418 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
Ezio Melotti745d54d2013-11-16 19:10:57 +020011420 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11421 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Antoine Pitrou22425222011-10-04 19:10:51 +020011424 if (PyUnicode_READY(self) == -1)
11425 return NULL;
11426
Thomas Wouters7e474022000-07-16 12:04:32 +000011427 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011428 src_len = PyUnicode_GET_LENGTH(self);
11429 i = j = line_pos = 0;
11430 kind = PyUnicode_KIND(self);
11431 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011432 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011433 for (; i < src_len; i++) {
11434 ch = PyUnicode_READ(kind, src_data, i);
11435 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011436 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011438 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011440 goto overflow;
11441 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011443 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011447 goto overflow;
11448 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011450 if (ch == '\n' || ch == '\r')
11451 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011453 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011454 if (!found)
11455 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011456
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011458 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 if (!u)
11460 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011461 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 for (; i < src_len; i++) {
11466 ch = PyUnicode_READ(kind, src_data, i);
11467 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 incr = tabsize - (line_pos % tabsize);
11470 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011471 FILL(kind, dest_data, ' ', j, incr);
11472 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011474 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011476 line_pos++;
11477 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011478 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011479 if (ch == '\n' || ch == '\r')
11480 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011482 }
11483 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011484 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011485
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011487 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489}
11490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011491PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493\n\
11494Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011495such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496arguments start and end are interpreted as in slice notation.\n\
11497\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
11500static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011503 /* initialize variables to prevent gcc warning */
11504 PyObject *substring = NULL;
11505 Py_ssize_t start = 0;
11506 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011507 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Jesus Ceaac451502011-04-20 17:09:23 +020011509 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11510 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
Christian Heimesd47802e2013-06-29 21:33:36 +020011513 if (PyUnicode_READY(self) == -1) {
11514 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011516 }
11517 if (PyUnicode_READY(substring) == -1) {
11518 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521
Victor Stinner7931d9a2011-11-04 00:22:48 +010011522 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
11524 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 if (result == -2)
11527 return NULL;
11528
Christian Heimes217cfd12007-12-02 14:31:20 +000011529 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530}
11531
11532static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011533unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011535 void *data;
11536 enum PyUnicode_Kind kind;
11537 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011538
11539 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11540 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011542 }
11543 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11544 PyErr_SetString(PyExc_IndexError, "string index out of range");
11545 return NULL;
11546 }
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_DATA(self);
11549 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011550 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551}
11552
Guido van Rossumc2504932007-09-18 19:42:40 +000011553/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011554 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011555static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011556unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Guido van Rossumc2504932007-09-18 19:42:40 +000011558 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011559 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011560
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011561#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011562 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011563#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (_PyUnicode_HASH(self) != -1)
11565 return _PyUnicode_HASH(self);
11566 if (PyUnicode_READY(self) == -1)
11567 return -1;
11568 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011569 /*
11570 We make the hash of the empty string be 0, rather than using
11571 (prefix ^ suffix), since this slightly obfuscates the hash secret
11572 */
11573 if (len == 0) {
11574 _PyUnicode_HASH(self) = 0;
11575 return 0;
11576 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011577 x = _Py_HashBytes(PyUnicode_DATA(self),
11578 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011580 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581}
11582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011586Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011591 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011592 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011593 PyObject *substring = NULL;
11594 Py_ssize_t start = 0;
11595 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596
Jesus Ceaac451502011-04-20 17:09:23 +020011597 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11598 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Christian Heimesd47a0452013-06-29 21:21:37 +020011601 if (PyUnicode_READY(self) == -1) {
11602 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011604 }
11605 if (PyUnicode_READY(substring) == -1) {
11606 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609
Victor Stinner7931d9a2011-11-04 00:22:48 +010011610 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
11612 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (result == -2)
11615 return NULL;
11616
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 if (result < 0) {
11618 PyErr_SetString(PyExc_ValueError, "substring not found");
11619 return NULL;
11620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621
Christian Heimes217cfd12007-12-02 14:31:20 +000011622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011625PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011628Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011629at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
11631static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011632unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 Py_ssize_t i, length;
11635 int kind;
11636 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 int cased;
11638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (PyUnicode_READY(self) == -1)
11640 return NULL;
11641 length = PyUnicode_GET_LENGTH(self);
11642 kind = PyUnicode_KIND(self);
11643 data = PyUnicode_DATA(self);
11644
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 if (length == 1)
11647 return PyBool_FromLong(
11648 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011650 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011653
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 for (i = 0; i < length; i++) {
11656 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011657
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11659 return PyBool_FromLong(0);
11660 else if (!cased && Py_UNICODE_ISLOWER(ch))
11661 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011663 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664}
11665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011669Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
11672static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011673unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 Py_ssize_t i, length;
11676 int kind;
11677 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 int cased;
11679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (PyUnicode_READY(self) == -1)
11681 return NULL;
11682 length = PyUnicode_GET_LENGTH(self);
11683 kind = PyUnicode_KIND(self);
11684 data = PyUnicode_DATA(self);
11685
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (length == 1)
11688 return PyBool_FromLong(
11689 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011691 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011694
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 for (i = 0; i < length; i++) {
11697 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011698
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11700 return PyBool_FromLong(0);
11701 else if (!cased && Py_UNICODE_ISUPPER(ch))
11702 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011704 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705}
11706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011707PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011710Return True if S is a titlecased string and there is at least one\n\
11711character in S, i.e. upper- and titlecase characters may only\n\
11712follow uncased characters and lowercase characters only cased ones.\n\
11713Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
11715static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011716unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 Py_ssize_t i, length;
11719 int kind;
11720 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 int cased, previous_is_cased;
11722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 if (PyUnicode_READY(self) == -1)
11724 return NULL;
11725 length = PyUnicode_GET_LENGTH(self);
11726 kind = PyUnicode_KIND(self);
11727 data = PyUnicode_DATA(self);
11728
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (length == 1) {
11731 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11732 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11733 (Py_UNICODE_ISUPPER(ch) != 0));
11734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011736 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011739
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 cased = 0;
11741 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 for (i = 0; i < length; i++) {
11743 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011744
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11746 if (previous_is_cased)
11747 return PyBool_FromLong(0);
11748 previous_is_cased = 1;
11749 cased = 1;
11750 }
11751 else if (Py_UNICODE_ISLOWER(ch)) {
11752 if (!previous_is_cased)
11753 return PyBool_FromLong(0);
11754 previous_is_cased = 1;
11755 cased = 1;
11756 }
11757 else
11758 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011760 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761}
11762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011766Return True if all characters in S are whitespace\n\
11767and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
11769static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011770unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 Py_ssize_t i, length;
11773 int kind;
11774 void *data;
11775
11776 if (PyUnicode_READY(self) == -1)
11777 return NULL;
11778 length = PyUnicode_GET_LENGTH(self);
11779 kind = PyUnicode_KIND(self);
11780 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (length == 1)
11784 return PyBool_FromLong(
11785 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 for (i = 0; i < length; i++) {
11792 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011793 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011796 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797}
11798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011799PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011802Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011803and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011804
11805static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011806unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 Py_ssize_t i, length;
11809 int kind;
11810 void *data;
11811
11812 if (PyUnicode_READY(self) == -1)
11813 return NULL;
11814 length = PyUnicode_GET_LENGTH(self);
11815 kind = PyUnicode_KIND(self);
11816 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011817
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011818 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (length == 1)
11820 return PyBool_FromLong(
11821 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011822
11823 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 for (i = 0; i < length; i++) {
11828 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011831 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011832}
11833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011836\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011837Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011838and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011839
11840static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011841unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 int kind;
11844 void *data;
11845 Py_ssize_t len, i;
11846
11847 if (PyUnicode_READY(self) == -1)
11848 return NULL;
11849
11850 kind = PyUnicode_KIND(self);
11851 data = PyUnicode_DATA(self);
11852 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011853
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011854 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 if (len == 1) {
11856 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11857 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11858 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011859
11860 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 for (i = 0; i < len; i++) {
11865 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011866 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011868 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870}
11871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011872PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011875Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011876False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
11878static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011879unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 Py_ssize_t i, length;
11882 int kind;
11883 void *data;
11884
11885 if (PyUnicode_READY(self) == -1)
11886 return NULL;
11887 length = PyUnicode_GET_LENGTH(self);
11888 kind = PyUnicode_KIND(self);
11889 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (length == 1)
11893 return PyBool_FromLong(
11894 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011896 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 for (i = 0; i < length; i++) {
11901 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011904 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905}
11906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011907PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011910Return True if all characters in S are digits\n\
11911and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
11913static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011914unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 Py_ssize_t i, length;
11917 int kind;
11918 void *data;
11919
11920 if (PyUnicode_READY(self) == -1)
11921 return NULL;
11922 length = PyUnicode_GET_LENGTH(self);
11923 kind = PyUnicode_KIND(self);
11924 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (length == 1) {
11928 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11929 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 for (i = 0; i < length; i++) {
11937 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011940 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941}
11942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011943PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011946Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011947False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
11949static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011950unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 Py_ssize_t i, length;
11953 int kind;
11954 void *data;
11955
11956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958 length = PyUnicode_GET_LENGTH(self);
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 1)
11964 return PyBool_FromLong(
11965 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 for (i = 0; i < length; i++) {
11972 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976}
11977
Martin v. Löwis47383402007-08-15 07:32:56 +000011978int
11979PyUnicode_IsIdentifier(PyObject *self)
11980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 int kind;
11982 void *data;
11983 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011984 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (PyUnicode_READY(self) == -1) {
11987 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 }
11990
11991 /* Special case for empty strings */
11992 if (PyUnicode_GET_LENGTH(self) == 0)
11993 return 0;
11994 kind = PyUnicode_KIND(self);
11995 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011996
11997 /* PEP 3131 says that the first character must be in
11998 XID_Start and subsequent characters in XID_Continue,
11999 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012001 letters, digits, underscore). However, given the current
12002 definition of XID_Start and XID_Continue, it is sufficient
12003 to check just for these, except that _ must be allowed
12004 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012006 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012007 return 0;
12008
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012009 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012012 return 1;
12013}
12014
12015PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012017\n\
12018Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012019to the language definition.\n\
12020\n\
12021Use keyword.iskeyword() to test for reserved identifiers\n\
12022such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012023
12024static PyObject*
12025unicode_isidentifier(PyObject *self)
12026{
12027 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12028}
12029
Georg Brandl559e5d72008-06-11 18:37:52 +000012030PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012031 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012032\n\
12033Return True if all characters in S are considered\n\
12034printable in repr() or S is empty, False otherwise.");
12035
12036static PyObject*
12037unicode_isprintable(PyObject *self)
12038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 Py_ssize_t i, length;
12040 int kind;
12041 void *data;
12042
12043 if (PyUnicode_READY(self) == -1)
12044 return NULL;
12045 length = PyUnicode_GET_LENGTH(self);
12046 kind = PyUnicode_KIND(self);
12047 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012048
12049 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 if (length == 1)
12051 return PyBool_FromLong(
12052 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 for (i = 0; i < length; i++) {
12055 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012056 Py_RETURN_FALSE;
12057 }
12058 }
12059 Py_RETURN_TRUE;
12060}
12061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012062PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012063 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064\n\
12065Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012066iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
12068static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012069unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012071 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072}
12073
Martin v. Löwis18e16552006-02-15 17:27:45 +000012074static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012075unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 if (PyUnicode_READY(self) == -1)
12078 return -1;
12079 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080}
12081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012082PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012085Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012086done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
12088static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012089unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012091 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 Py_UCS4 fillchar = ' ';
12093
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012094 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 return NULL;
12096
Benjamin Petersonbac79492012-01-14 13:34:47 -050012097 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
Victor Stinnerc4b49542011-12-11 22:44:26 +010012100 if (PyUnicode_GET_LENGTH(self) >= width)
12101 return unicode_result_unchanged(self);
12102
12103 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104}
12105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012106PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012109Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
12111static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012112unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012114 if (PyUnicode_READY(self) == -1)
12115 return NULL;
12116 if (PyUnicode_IS_ASCII(self))
12117 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012118 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119}
12120
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode.  Both the
   strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12129
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130/* externally visible for str.strip(unicode) */
12131PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012132_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 void *data;
12135 int kind;
12136 Py_ssize_t i, j, len;
12137 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012138 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12141 return NULL;
12142
12143 kind = PyUnicode_KIND(self);
12144 data = PyUnicode_DATA(self);
12145 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012146 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12148 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012149 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012150
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 i = 0;
12152 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012153 while (i < len) {
12154 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12155 if (!BLOOM(sepmask, ch))
12156 break;
12157 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12158 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 i++;
12160 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012162
Benjamin Peterson14339b62009-01-31 16:36:08 +000012163 j = len;
12164 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012165 j--;
12166 while (j >= i) {
12167 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12168 if (!BLOOM(sepmask, ch))
12169 break;
12170 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12171 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012173 }
12174
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012176 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012177
Victor Stinner7931d9a2011-11-04 00:22:48 +010012178 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179}
12180
12181PyObject*
12182PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12183{
12184 unsigned char *data;
12185 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012186 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187
Victor Stinnerde636f32011-10-01 03:55:54 +020012188 if (PyUnicode_READY(self) == -1)
12189 return NULL;
12190
Victor Stinner684d5fd2012-05-03 02:32:34 +020012191 length = PyUnicode_GET_LENGTH(self);
12192 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012193
Victor Stinner684d5fd2012-05-03 02:32:34 +020012194 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012195 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196
Victor Stinnerde636f32011-10-01 03:55:54 +020012197 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012198 PyErr_SetString(PyExc_IndexError, "string index out of range");
12199 return NULL;
12200 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012201 if (start >= length || end < start)
12202 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012203
Victor Stinner684d5fd2012-05-03 02:32:34 +020012204 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012205 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012206 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012207 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012208 }
12209 else {
12210 kind = PyUnicode_KIND(self);
12211 data = PyUnicode_1BYTE_DATA(self);
12212 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012213 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012214 length);
12215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
12218static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012219do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 Py_ssize_t len, i, j;
12222
12223 if (PyUnicode_READY(self) == -1)
12224 return NULL;
12225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012227
Victor Stinnercc7af722013-04-09 22:39:24 +020012228 if (PyUnicode_IS_ASCII(self)) {
12229 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12230
12231 i = 0;
12232 if (striptype != RIGHTSTRIP) {
12233 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012234 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012235 if (!_Py_ascii_whitespace[ch])
12236 break;
12237 i++;
12238 }
12239 }
12240
12241 j = len;
12242 if (striptype != LEFTSTRIP) {
12243 j--;
12244 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012245 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012246 if (!_Py_ascii_whitespace[ch])
12247 break;
12248 j--;
12249 }
12250 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012251 }
12252 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012253 else {
12254 int kind = PyUnicode_KIND(self);
12255 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012256
Victor Stinnercc7af722013-04-09 22:39:24 +020012257 i = 0;
12258 if (striptype != RIGHTSTRIP) {
12259 while (i < len) {
12260 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12261 if (!Py_UNICODE_ISSPACE(ch))
12262 break;
12263 i++;
12264 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012265 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012266
12267 j = len;
12268 if (striptype != LEFTSTRIP) {
12269 j--;
12270 while (j >= i) {
12271 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12272 if (!Py_UNICODE_ISSPACE(ch))
12273 break;
12274 j--;
12275 }
12276 j++;
12277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012279
Victor Stinner7931d9a2011-11-04 00:22:48 +010012280 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281}
12282
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012283
12284static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012285do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012286{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012287 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288
Serhiy Storchakac6792272013-10-19 21:03:34 +030012289 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012290 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012291
Benjamin Peterson14339b62009-01-31 16:36:08 +000012292 if (sep != NULL && sep != Py_None) {
12293 if (PyUnicode_Check(sep))
12294 return _PyUnicode_XStrip(self, striptype, sep);
12295 else {
12296 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 "%s arg must be None or str",
12298 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 return NULL;
12300 }
12301 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304}
12305
12306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012307PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012309\n\
12310Return a copy of the string S with leading and trailing\n\
12311whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012312If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012313
12314static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012315unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012316{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012317 if (PyTuple_GET_SIZE(args) == 0)
12318 return do_strip(self, BOTHSTRIP); /* Common case */
12319 else
12320 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012321}
12322
12323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012324PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326\n\
12327Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012328If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012329
12330static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012331unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 if (PyTuple_GET_SIZE(args) == 0)
12334 return do_strip(self, LEFTSTRIP); /* Common case */
12335 else
12336 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012337}
12338
12339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012340PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342\n\
12343Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012344If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012345
12346static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012347unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012348{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012349 if (PyTuple_GET_SIZE(args) == 0)
12350 return do_strip(self, RIGHTSTRIP); /* Common case */
12351 else
12352 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012353}
12354
12355
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012357unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012359 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
Serhiy Storchaka05997252013-01-26 12:14:02 +020012362 if (len < 1)
12363 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364
Victor Stinnerc4b49542011-12-11 22:44:26 +010012365 /* no repeat, return original string */
12366 if (len == 1)
12367 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012368
Benjamin Petersonbac79492012-01-14 13:34:47 -050012369 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 return NULL;
12371
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012372 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012373 PyErr_SetString(PyExc_OverflowError,
12374 "repeated string is too long");
12375 return NULL;
12376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012378
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012379 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 if (!u)
12381 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012382 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (PyUnicode_GET_LENGTH(str) == 1) {
12385 const int kind = PyUnicode_KIND(str);
12386 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012387 if (kind == PyUnicode_1BYTE_KIND) {
12388 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012389 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012390 }
12391 else if (kind == PyUnicode_2BYTE_KIND) {
12392 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012393 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012394 ucs2[n] = fill_char;
12395 } else {
12396 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12397 assert(kind == PyUnicode_4BYTE_KIND);
12398 for (n = 0; n < len; ++n)
12399 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 }
12402 else {
12403 /* number of characters copied this far */
12404 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012405 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 char *to = (char *) PyUnicode_DATA(u);
12407 Py_MEMCPY(to, PyUnicode_DATA(str),
12408 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 n = (done <= nchars-done) ? done : nchars-done;
12411 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012412 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414 }
12415
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012416 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012417 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418}
12419
Alexander Belopolsky40018472011-02-26 01:02:56 +000012420PyObject *
12421PyUnicode_Replace(PyObject *obj,
12422 PyObject *subobj,
12423 PyObject *replobj,
12424 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425{
12426 PyObject *self;
12427 PyObject *str1;
12428 PyObject *str2;
12429 PyObject *result;
12430
12431 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012432 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012435 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 Py_DECREF(self);
12437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438 }
12439 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012440 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 Py_DECREF(self);
12442 Py_DECREF(str1);
12443 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012445 if (PyUnicode_READY(self) == -1 ||
12446 PyUnicode_READY(str1) == -1 ||
12447 PyUnicode_READY(str2) == -1)
12448 result = NULL;
12449 else
12450 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451 Py_DECREF(self);
12452 Py_DECREF(str1);
12453 Py_DECREF(str2);
12454 return result;
12455}
12456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012457PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012458 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459\n\
12460Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012461old replaced by new. If the optional argument count is\n\
12462given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463
12464static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 PyObject *str1;
12468 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012469 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470 PyObject *result;
12471
Martin v. Löwis18e16552006-02-15 17:27:45 +000012472 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012474 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012477 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 return NULL;
12479 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012480 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 Py_DECREF(str1);
12482 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012483 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012484 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12485 result = NULL;
12486 else
12487 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488
12489 Py_DECREF(str1);
12490 Py_DECREF(str2);
12491 return result;
12492}
12493
Alexander Belopolsky40018472011-02-26 01:02:56 +000012494static PyObject *
12495unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012497 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 Py_ssize_t isize;
12499 Py_ssize_t osize, squote, dquote, i, o;
12500 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012501 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012505 return NULL;
12506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 isize = PyUnicode_GET_LENGTH(unicode);
12508 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 /* Compute length of output, quote characters, and
12511 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012512 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 max = 127;
12514 squote = dquote = 0;
12515 ikind = PyUnicode_KIND(unicode);
12516 for (i = 0; i < isize; i++) {
12517 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012518 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012520 case '\'': squote++; break;
12521 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012523 incr = 2;
12524 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 default:
12526 /* Fast-path ASCII */
12527 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012528 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012530 ;
12531 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012534 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012536 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012538 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012540 if (osize > PY_SSIZE_T_MAX - incr) {
12541 PyErr_SetString(PyExc_OverflowError,
12542 "string is too long to generate repr");
12543 return NULL;
12544 }
12545 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 }
12547
12548 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012549 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012551 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 if (dquote)
12553 /* Both squote and dquote present. Use squote,
12554 and escape them */
12555 osize += squote;
12556 else
12557 quote = '"';
12558 }
Victor Stinner55c08782013-04-14 18:45:39 +020012559 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560
12561 repr = PyUnicode_New(osize, max);
12562 if (repr == NULL)
12563 return NULL;
12564 okind = PyUnicode_KIND(repr);
12565 odata = PyUnicode_DATA(repr);
12566
12567 PyUnicode_WRITE(okind, odata, 0, quote);
12568 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012569 if (unchanged) {
12570 _PyUnicode_FastCopyCharacters(repr, 1,
12571 unicode, 0,
12572 isize);
12573 }
12574 else {
12575 for (i = 0, o = 1; i < isize; i++) {
12576 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577
Victor Stinner55c08782013-04-14 18:45:39 +020012578 /* Escape quotes and backslashes */
12579 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012580 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012582 continue;
12583 }
12584
12585 /* Map special whitespace to '\t', \n', '\r' */
12586 if (ch == '\t') {
12587 PyUnicode_WRITE(okind, odata, o++, '\\');
12588 PyUnicode_WRITE(okind, odata, o++, 't');
12589 }
12590 else if (ch == '\n') {
12591 PyUnicode_WRITE(okind, odata, o++, '\\');
12592 PyUnicode_WRITE(okind, odata, o++, 'n');
12593 }
12594 else if (ch == '\r') {
12595 PyUnicode_WRITE(okind, odata, o++, '\\');
12596 PyUnicode_WRITE(okind, odata, o++, 'r');
12597 }
12598
12599 /* Map non-printable US ASCII to '\xhh' */
12600 else if (ch < ' ' || ch == 0x7F) {
12601 PyUnicode_WRITE(okind, odata, o++, '\\');
12602 PyUnicode_WRITE(okind, odata, o++, 'x');
12603 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12604 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12605 }
12606
12607 /* Copy ASCII characters as-is */
12608 else if (ch < 0x7F) {
12609 PyUnicode_WRITE(okind, odata, o++, ch);
12610 }
12611
12612 /* Non-ASCII characters */
12613 else {
12614 /* Map Unicode whitespace and control characters
12615 (categories Z* and C* except ASCII space)
12616 */
12617 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12618 PyUnicode_WRITE(okind, odata, o++, '\\');
12619 /* Map 8-bit characters to '\xhh' */
12620 if (ch <= 0xff) {
12621 PyUnicode_WRITE(okind, odata, o++, 'x');
12622 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12623 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12624 }
12625 /* Map 16-bit characters to '\uxxxx' */
12626 else if (ch <= 0xffff) {
12627 PyUnicode_WRITE(okind, odata, o++, 'u');
12628 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12629 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12630 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12631 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12632 }
12633 /* Map 21-bit characters to '\U00xxxxxx' */
12634 else {
12635 PyUnicode_WRITE(okind, odata, o++, 'U');
12636 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12637 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12638 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12639 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12640 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12641 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12642 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12643 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12644 }
12645 }
12646 /* Copy characters as-is */
12647 else {
12648 PyUnicode_WRITE(okind, odata, o++, ch);
12649 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012650 }
12651 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012654 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012655 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656}
12657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012658PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660\n\
12661Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012662such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663arguments start and end are interpreted as in slice notation.\n\
12664\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012665Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666
12667static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012670 /* initialize variables to prevent gcc warning */
12671 PyObject *substring = NULL;
12672 Py_ssize_t start = 0;
12673 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012674 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
Jesus Ceaac451502011-04-20 17:09:23 +020012676 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12677 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
Christian Heimesea71a522013-06-29 21:17:34 +020012680 if (PyUnicode_READY(self) == -1) {
12681 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012683 }
12684 if (PyUnicode_READY(substring) == -1) {
12685 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688
Victor Stinner7931d9a2011-11-04 00:22:48 +010012689 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690
12691 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 if (result == -2)
12694 return NULL;
12695
Christian Heimes217cfd12007-12-02 14:31:20 +000012696 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697}
12698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012699PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
12704static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012707 /* initialize variables to prevent gcc warning */
12708 PyObject *substring = NULL;
12709 Py_ssize_t start = 0;
12710 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712
Jesus Ceaac451502011-04-20 17:09:23 +020012713 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12714 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716
Christian Heimesea71a522013-06-29 21:17:34 +020012717 if (PyUnicode_READY(self) == -1) {
12718 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012720 }
12721 if (PyUnicode_READY(substring) == -1) {
12722 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725
Victor Stinner7931d9a2011-11-04 00:22:48 +010012726 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727
12728 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 if (result == -2)
12731 return NULL;
12732
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733 if (result < 0) {
12734 PyErr_SetString(PyExc_ValueError, "substring not found");
12735 return NULL;
12736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737
Christian Heimes217cfd12007-12-02 14:31:20 +000012738 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739}
12740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012741PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012744Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012745done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
12747static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012748unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012750 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 Py_UCS4 fillchar = ' ';
12752
Victor Stinnere9a29352011-10-01 02:14:59 +020012753 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012755
Benjamin Petersonbac79492012-01-14 13:34:47 -050012756 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757 return NULL;
12758
Victor Stinnerc4b49542011-12-11 22:44:26 +010012759 if (PyUnicode_GET_LENGTH(self) >= width)
12760 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761
Victor Stinnerc4b49542011-12-11 22:44:26 +010012762 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763}
12764
Alexander Belopolsky40018472011-02-26 01:02:56 +000012765PyObject *
12766PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767{
12768 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012769
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770 s = PyUnicode_FromObject(s);
12771 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 if (sep != NULL) {
12774 sep = PyUnicode_FromObject(sep);
12775 if (sep == NULL) {
12776 Py_DECREF(s);
12777 return NULL;
12778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779 }
12780
Victor Stinner9310abb2011-10-05 00:59:23 +020012781 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782
12783 Py_DECREF(s);
12784 Py_XDECREF(sep);
12785 return result;
12786}
12787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012788PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012789 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790\n\
12791Return a list of the words in S, using sep as the\n\
12792delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012793splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012794whitespace string is a separator and empty strings are\n\
12795removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796
12797static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012798unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012800 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012802 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012804 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12805 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806 return NULL;
12807
12808 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012811 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012813 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814}
12815
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816PyObject *
12817PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12818{
12819 PyObject* str_obj;
12820 PyObject* sep_obj;
12821 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012822 int kind1, kind2;
12823 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825
12826 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012827 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012829 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012830 if (!sep_obj) {
12831 Py_DECREF(str_obj);
12832 return NULL;
12833 }
12834 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12835 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012836 Py_DECREF(str_obj);
12837 return NULL;
12838 }
12839
Victor Stinner14f8f022011-10-05 20:58:25 +020012840 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 len1 = PyUnicode_GET_LENGTH(str_obj);
12843 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012844 if (kind1 < kind2 || len1 < len2) {
12845 _Py_INCREF_UNICODE_EMPTY();
12846 if (!unicode_empty)
12847 out = NULL;
12848 else {
12849 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12850 Py_DECREF(unicode_empty);
12851 }
12852 Py_DECREF(sep_obj);
12853 Py_DECREF(str_obj);
12854 return out;
12855 }
12856 buf1 = PyUnicode_DATA(str_obj);
12857 buf2 = PyUnicode_DATA(sep_obj);
12858 if (kind2 != kind1) {
12859 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12860 if (!buf2)
12861 goto onError;
12862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012864 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012866 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12867 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12868 else
12869 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 break;
12871 case PyUnicode_2BYTE_KIND:
12872 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12873 break;
12874 case PyUnicode_4BYTE_KIND:
12875 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12876 break;
12877 default:
12878 assert(0);
12879 out = 0;
12880 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012881
12882 Py_DECREF(sep_obj);
12883 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012884 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012886
12887 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 onError:
12889 Py_DECREF(sep_obj);
12890 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012891 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 PyMem_Free(buf2);
12893 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012894}
12895
12896
12897PyObject *
12898PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12899{
12900 PyObject* str_obj;
12901 PyObject* sep_obj;
12902 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012903 int kind1, kind2;
12904 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
12907 str_obj = PyUnicode_FromObject(str_in);
12908 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910 sep_obj = PyUnicode_FromObject(sep_in);
12911 if (!sep_obj) {
12912 Py_DECREF(str_obj);
12913 return NULL;
12914 }
12915
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012916 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 len1 = PyUnicode_GET_LENGTH(str_obj);
12919 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012920 if (kind1 < kind2 || len1 < len2) {
12921 _Py_INCREF_UNICODE_EMPTY();
12922 if (!unicode_empty)
12923 out = NULL;
12924 else {
12925 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12926 Py_DECREF(unicode_empty);
12927 }
12928 Py_DECREF(sep_obj);
12929 Py_DECREF(str_obj);
12930 return out;
12931 }
12932 buf1 = PyUnicode_DATA(str_obj);
12933 buf2 = PyUnicode_DATA(sep_obj);
12934 if (kind2 != kind1) {
12935 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12936 if (!buf2)
12937 goto onError;
12938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012940 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012942 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12943 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12944 else
12945 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 break;
12947 case PyUnicode_2BYTE_KIND:
12948 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12949 break;
12950 case PyUnicode_4BYTE_KIND:
12951 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12952 break;
12953 default:
12954 assert(0);
12955 out = 0;
12956 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957
12958 Py_DECREF(sep_obj);
12959 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012960 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012962
12963 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 onError:
12965 Py_DECREF(sep_obj);
12966 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012967 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 PyMem_Free(buf2);
12969 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970}
12971
12972PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012975Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012977found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978
12979static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012980unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981{
Victor Stinner9310abb2011-10-05 00:59:23 +020012982 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983}
12984
12985PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012986 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012988Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012990separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991
12992static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012993unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994{
Victor Stinner9310abb2011-10-05 00:59:23 +020012995 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996}
12997
Alexander Belopolsky40018472011-02-26 01:02:56 +000012998PyObject *
12999PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013000{
13001 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013002
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013003 s = PyUnicode_FromObject(s);
13004 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 if (sep != NULL) {
13007 sep = PyUnicode_FromObject(sep);
13008 if (sep == NULL) {
13009 Py_DECREF(s);
13010 return NULL;
13011 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013012 }
13013
Victor Stinner9310abb2011-10-05 00:59:23 +020013014 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013015
13016 Py_DECREF(s);
13017 Py_XDECREF(sep);
13018 return result;
13019}
13020
13021PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013022 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013023\n\
13024Return a list of the words in S, using sep as the\n\
13025delimiter string, starting at the end of the string and\n\
13026working to the front. If maxsplit is given, at most maxsplit\n\
13027splits are done. If sep is not specified, any whitespace string\n\
13028is a separator.");
13029
13030static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013031unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013032{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013033 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013034 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013035 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013036
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013037 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13038 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013039 return NULL;
13040
13041 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013043 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020013044 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013045 else
Victor Stinner9310abb2011-10-05 00:59:23 +020013046 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013047}
13048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013049PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013050 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051\n\
13052Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013053Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013054is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055
13056static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013057unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013059 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013060 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013062 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13063 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064 return NULL;
13065
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013066 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067}
13068
13069static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013070PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013072 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073}
13074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013075PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077\n\
13078Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013079and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080
13081static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013082unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013084 if (PyUnicode_READY(self) == -1)
13085 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013086 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087}
13088
Larry Hastings61272b72014-01-07 12:41:53 -080013089/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013090
Larry Hastings31826802013-10-19 00:09:25 -070013091@staticmethod
13092str.maketrans as unicode_maketrans
13093
13094 x: object
13095
13096 y: unicode=NULL
13097
13098 z: unicode=NULL
13099
13100 /
13101
13102Return a translation table usable for str.translate().
13103
13104If there is only one argument, it must be a dictionary mapping Unicode
13105ordinals (integers) or characters to Unicode ordinals, strings or None.
13106Character keys will be then converted to ordinals.
13107If there are two arguments, they must be strings of equal length, and
13108in the resulting dictionary, each character in x will be mapped to the
13109character at the same position in y. If there is a third argument, it
13110must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013111[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013112
Larry Hastings31826802013-10-19 00:09:25 -070013113static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013114unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013115/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013116{
Georg Brandlceee0772007-11-27 23:48:05 +000013117 PyObject *new = NULL, *key, *value;
13118 Py_ssize_t i = 0;
13119 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013120
Georg Brandlceee0772007-11-27 23:48:05 +000013121 new = PyDict_New();
13122 if (!new)
13123 return NULL;
13124 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 int x_kind, y_kind, z_kind;
13126 void *x_data, *y_data, *z_data;
13127
Georg Brandlceee0772007-11-27 23:48:05 +000013128 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013129 if (!PyUnicode_Check(x)) {
13130 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13131 "be a string if there is a second argument");
13132 goto err;
13133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013135 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13136 "arguments must have equal length");
13137 goto err;
13138 }
13139 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 x_kind = PyUnicode_KIND(x);
13141 y_kind = PyUnicode_KIND(y);
13142 x_data = PyUnicode_DATA(x);
13143 y_data = PyUnicode_DATA(y);
13144 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13145 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013146 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013147 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013148 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013149 if (!value) {
13150 Py_DECREF(key);
13151 goto err;
13152 }
Georg Brandlceee0772007-11-27 23:48:05 +000013153 res = PyDict_SetItem(new, key, value);
13154 Py_DECREF(key);
13155 Py_DECREF(value);
13156 if (res < 0)
13157 goto err;
13158 }
13159 /* create entries for deleting chars in z */
13160 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 z_kind = PyUnicode_KIND(z);
13162 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013163 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013165 if (!key)
13166 goto err;
13167 res = PyDict_SetItem(new, key, Py_None);
13168 Py_DECREF(key);
13169 if (res < 0)
13170 goto err;
13171 }
13172 }
13173 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174 int kind;
13175 void *data;
13176
Georg Brandlceee0772007-11-27 23:48:05 +000013177 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013178 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013179 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13180 "to maketrans it must be a dict");
13181 goto err;
13182 }
13183 /* copy entries into the new dict, converting string keys to int keys */
13184 while (PyDict_Next(x, &i, &key, &value)) {
13185 if (PyUnicode_Check(key)) {
13186 /* convert string keys to integer keys */
13187 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013188 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013189 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13190 "table must be of length 1");
13191 goto err;
13192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 kind = PyUnicode_KIND(key);
13194 data = PyUnicode_DATA(key);
13195 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013196 if (!newkey)
13197 goto err;
13198 res = PyDict_SetItem(new, newkey, value);
13199 Py_DECREF(newkey);
13200 if (res < 0)
13201 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013202 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013203 /* just keep integer keys */
13204 if (PyDict_SetItem(new, key, value) < 0)
13205 goto err;
13206 } else {
13207 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13208 "be strings or integers");
13209 goto err;
13210 }
13211 }
13212 }
13213 return new;
13214 err:
13215 Py_DECREF(new);
13216 return NULL;
13217}
13218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013219PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013222Return a copy of the string S in which each character has been mapped\n\
13223through the given translation table. The table must implement\n\
13224lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13225mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13226this operation raises LookupError, the character is left untouched.\n\
13227Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228
13229static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233}
13234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013235PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013238Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239
13240static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013241unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013243 if (PyUnicode_READY(self) == -1)
13244 return NULL;
13245 if (PyUnicode_IS_ASCII(self))
13246 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013247 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248}
13249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013250PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013253Pad a numeric string S with zeros on the left, to fill a field\n\
13254of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255
13256static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013257unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013259 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013260 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013261 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013262 int kind;
13263 void *data;
13264 Py_UCS4 chr;
13265
Martin v. Löwis18e16552006-02-15 17:27:45 +000013266 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267 return NULL;
13268
Benjamin Petersonbac79492012-01-14 13:34:47 -050013269 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271
Victor Stinnerc4b49542011-12-11 22:44:26 +010013272 if (PyUnicode_GET_LENGTH(self) >= width)
13273 return unicode_result_unchanged(self);
13274
13275 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276
13277 u = pad(self, fill, 0, '0');
13278
Walter Dörwald068325e2002-04-15 13:36:47 +000013279 if (u == NULL)
13280 return NULL;
13281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 kind = PyUnicode_KIND(u);
13283 data = PyUnicode_DATA(u);
13284 chr = PyUnicode_READ(kind, data, fill);
13285
13286 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 PyUnicode_WRITE(kind, data, 0, chr);
13289 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290 }
13291
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013292 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013293 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013304PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013307Return True if S starts with the specified prefix, False otherwise.\n\
13308With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013309With optional end, stop comparing S at that position.\n\
13310prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311
13312static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013313unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013316 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013317 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013318 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013319 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013320 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013321
Jesus Ceaac451502011-04-20 17:09:23 +020013322 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013324 if (PyTuple_Check(subobj)) {
13325 Py_ssize_t i;
13326 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013327 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013328 if (substring == NULL)
13329 return NULL;
13330 result = tailmatch(self, substring, start, end, -1);
13331 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013332 if (result == -1)
13333 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013334 if (result) {
13335 Py_RETURN_TRUE;
13336 }
13337 }
13338 /* nothing matched */
13339 Py_RETURN_FALSE;
13340 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013341 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013342 if (substring == NULL) {
13343 if (PyErr_ExceptionMatches(PyExc_TypeError))
13344 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13345 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013347 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013348 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013350 if (result == -1)
13351 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013352 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353}
13354
13355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013356PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013359Return True if S ends with the specified suffix, False otherwise.\n\
13360With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013361With optional end, stop comparing S at that position.\n\
13362suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363
13364static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013365unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013368 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013369 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013370 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013371 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013372 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373
Jesus Ceaac451502011-04-20 17:09:23 +020013374 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013376 if (PyTuple_Check(subobj)) {
13377 Py_ssize_t i;
13378 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013379 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013381 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 result = tailmatch(self, substring, start, end, +1);
13384 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013385 if (result == -1)
13386 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 if (result) {
13388 Py_RETURN_TRUE;
13389 }
13390 }
13391 Py_RETURN_FALSE;
13392 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013393 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013394 if (substring == NULL) {
13395 if (PyErr_ExceptionMatches(PyExc_TypeError))
13396 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13397 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013399 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013401 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013402 if (result == -1)
13403 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405}
13406
Victor Stinner202fdca2012-05-07 12:47:02 +020013407Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013408_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013409{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013410 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13411 writer->data = PyUnicode_DATA(writer->buffer);
13412
13413 if (!writer->readonly) {
13414 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013415 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013416 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013417 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013418 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13419 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13420 writer->kind = PyUnicode_WCHAR_KIND;
13421 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13422
Victor Stinner8f674cc2013-04-17 23:02:17 +020013423 /* Copy-on-write mode: set buffer size to 0 so
13424 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13425 * next write. */
13426 writer->size = 0;
13427 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013428}
13429
Victor Stinnerd3f08822012-05-29 12:57:52 +020013430void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013431_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013432{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013433 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013434
13435 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013436 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013437
13438 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13439 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13440 writer->kind = PyUnicode_WCHAR_KIND;
13441 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013442}
13443
Victor Stinnerd3f08822012-05-29 12:57:52 +020013444int
13445_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13446 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013447{
13448 Py_ssize_t newlen;
13449 PyObject *newbuffer;
13450
Victor Stinnerca9381e2015-09-22 00:58:32 +020013451 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013452 assert((maxchar > writer->maxchar && length >= 0)
13453 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013454
Victor Stinner202fdca2012-05-07 12:47:02 +020013455 if (length > PY_SSIZE_T_MAX - writer->pos) {
13456 PyErr_NoMemory();
13457 return -1;
13458 }
13459 newlen = writer->pos + length;
13460
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013461 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013462
Victor Stinnerd3f08822012-05-29 12:57:52 +020013463 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013464 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013465 if (writer->overallocate
13466 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13467 /* overallocate to limit the number of realloc() */
13468 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013469 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013470 if (newlen < writer->min_length)
13471 newlen = writer->min_length;
13472
Victor Stinnerd3f08822012-05-29 12:57:52 +020013473 writer->buffer = PyUnicode_New(newlen, maxchar);
13474 if (writer->buffer == NULL)
13475 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013476 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013477 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013478 if (writer->overallocate
13479 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13480 /* overallocate to limit the number of realloc() */
13481 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013482 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013483 if (newlen < writer->min_length)
13484 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013486 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013487 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013488 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013489 newbuffer = PyUnicode_New(newlen, maxchar);
13490 if (newbuffer == NULL)
13491 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013492 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13493 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013494 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013495 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013496 }
13497 else {
13498 newbuffer = resize_compact(writer->buffer, newlen);
13499 if (newbuffer == NULL)
13500 return -1;
13501 }
13502 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013503 }
13504 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013505 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 newbuffer = PyUnicode_New(writer->size, maxchar);
13507 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013508 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13510 writer->buffer, 0, writer->pos);
13511 Py_DECREF(writer->buffer);
13512 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013513 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013514 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013515 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013516
13517#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013518}
13519
Victor Stinnerca9381e2015-09-22 00:58:32 +020013520int
13521_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13522 enum PyUnicode_Kind kind)
13523{
13524 Py_UCS4 maxchar;
13525
13526 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13527 assert(writer->kind < kind);
13528
13529 switch (kind)
13530 {
13531 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13532 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13533 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13534 default:
13535 assert(0 && "invalid kind");
13536 return -1;
13537 }
13538
13539 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13540}
13541
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013542Py_LOCAL_INLINE(int)
13543_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013544{
13545 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13546 return -1;
13547 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13548 writer->pos++;
13549 return 0;
13550}
13551
13552int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013553_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13554{
13555 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13556}
13557
13558int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013559_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13560{
13561 Py_UCS4 maxchar;
13562 Py_ssize_t len;
13563
13564 if (PyUnicode_READY(str) == -1)
13565 return -1;
13566 len = PyUnicode_GET_LENGTH(str);
13567 if (len == 0)
13568 return 0;
13569 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13570 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013571 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013572 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013573 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574 Py_INCREF(str);
13575 writer->buffer = str;
13576 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 writer->pos += len;
13578 return 0;
13579 }
13580 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13581 return -1;
13582 }
13583 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13584 str, 0, len);
13585 writer->pos += len;
13586 return 0;
13587}
13588
Victor Stinnere215d962012-10-06 23:03:36 +020013589int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013590_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13591 Py_ssize_t start, Py_ssize_t end)
13592{
13593 Py_UCS4 maxchar;
13594 Py_ssize_t len;
13595
13596 if (PyUnicode_READY(str) == -1)
13597 return -1;
13598
13599 assert(0 <= start);
13600 assert(end <= PyUnicode_GET_LENGTH(str));
13601 assert(start <= end);
13602
13603 if (end == 0)
13604 return 0;
13605
13606 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13607 return _PyUnicodeWriter_WriteStr(writer, str);
13608
13609 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13610 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13611 else
13612 maxchar = writer->maxchar;
13613 len = end - start;
13614
13615 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13616 return -1;
13617
13618 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13619 str, start, len);
13620 writer->pos += len;
13621 return 0;
13622}
13623
13624int
Victor Stinner4a587072013-11-19 12:54:53 +010013625_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13626 const char *ascii, Py_ssize_t len)
13627{
13628 if (len == -1)
13629 len = strlen(ascii);
13630
13631 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13632
13633 if (writer->buffer == NULL && !writer->overallocate) {
13634 PyObject *str;
13635
13636 str = _PyUnicode_FromASCII(ascii, len);
13637 if (str == NULL)
13638 return -1;
13639
13640 writer->readonly = 1;
13641 writer->buffer = str;
13642 _PyUnicodeWriter_Update(writer);
13643 writer->pos += len;
13644 return 0;
13645 }
13646
13647 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13648 return -1;
13649
13650 switch (writer->kind)
13651 {
13652 case PyUnicode_1BYTE_KIND:
13653 {
13654 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13655 Py_UCS1 *data = writer->data;
13656
13657 Py_MEMCPY(data + writer->pos, str, len);
13658 break;
13659 }
13660 case PyUnicode_2BYTE_KIND:
13661 {
13662 _PyUnicode_CONVERT_BYTES(
13663 Py_UCS1, Py_UCS2,
13664 ascii, ascii + len,
13665 (Py_UCS2 *)writer->data + writer->pos);
13666 break;
13667 }
13668 case PyUnicode_4BYTE_KIND:
13669 {
13670 _PyUnicode_CONVERT_BYTES(
13671 Py_UCS1, Py_UCS4,
13672 ascii, ascii + len,
13673 (Py_UCS4 *)writer->data + writer->pos);
13674 break;
13675 }
13676 default:
13677 assert(0);
13678 }
13679
13680 writer->pos += len;
13681 return 0;
13682}
13683
13684int
13685_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13686 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013687{
13688 Py_UCS4 maxchar;
13689
13690 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13691 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13692 return -1;
13693 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13694 writer->pos += len;
13695 return 0;
13696}
13697
Victor Stinnerd3f08822012-05-29 12:57:52 +020013698PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013699_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013700{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013701 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013702 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013703 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013704 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013705 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013706 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013707 str = writer->buffer;
13708 writer->buffer = NULL;
13709 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13710 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013711 }
13712 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13713 PyObject *newbuffer;
13714 newbuffer = resize_compact(writer->buffer, writer->pos);
13715 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013716 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013717 return NULL;
13718 }
13719 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013720 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013721 str = writer->buffer;
13722 writer->buffer = NULL;
13723 assert(_PyUnicode_CheckConsistency(str, 1));
13724 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013725}
13726
Victor Stinnerd3f08822012-05-29 12:57:52 +020013727void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013728_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013729{
13730 Py_CLEAR(writer->buffer);
13731}
13732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013733#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013734
13735PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013736 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013737\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013738Return a formatted version of S, using substitutions from args and kwargs.\n\
13739The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013740
Eric Smith27bbca62010-11-04 17:06:58 +000013741PyDoc_STRVAR(format_map__doc__,
13742 "S.format_map(mapping) -> str\n\
13743\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013744Return a formatted version of S, using substitutions from mapping.\n\
13745The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013746
Eric Smith4a7d76d2008-05-30 18:10:19 +000013747static PyObject *
13748unicode__format__(PyObject* self, PyObject* args)
13749{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013750 PyObject *format_spec;
13751 _PyUnicodeWriter writer;
13752 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013753
13754 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13755 return NULL;
13756
Victor Stinnerd3f08822012-05-29 12:57:52 +020013757 if (PyUnicode_READY(self) == -1)
13758 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013759 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013760 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13761 self, format_spec, 0,
13762 PyUnicode_GET_LENGTH(format_spec));
13763 if (ret == -1) {
13764 _PyUnicodeWriter_Dealloc(&writer);
13765 return NULL;
13766 }
13767 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013768}
13769
Eric Smith8c663262007-08-25 02:26:07 +000013770PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013772\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013773Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013774
13775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013776unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013778 Py_ssize_t size;
13779
13780 /* If it's a compact object, account for base structure +
13781 character data. */
13782 if (PyUnicode_IS_COMPACT_ASCII(v))
13783 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13784 else if (PyUnicode_IS_COMPACT(v))
13785 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013786 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 else {
13788 /* If it is a two-block object, account for base object, and
13789 for character block if present. */
13790 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013791 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013792 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013793 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013794 }
13795 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013796 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013797 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013799 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013800 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801
13802 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013803}
13804
13805PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013806 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013807
13808static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013809unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013810{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013811 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 if (!copy)
13813 return NULL;
13814 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013815}
13816
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013818 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013819 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013820 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13821 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013822 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13823 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013824 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013825 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13826 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13827 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013828 {"expandtabs", (PyCFunction) unicode_expandtabs,
13829 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013830 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013831 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013832 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13833 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13834 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013835 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013836 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13837 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13838 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013839 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013840 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013841 {"splitlines", (PyCFunction) unicode_splitlines,
13842 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013843 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013844 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13845 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13846 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13847 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13848 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13849 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13850 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13851 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13852 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13853 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13854 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13855 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13856 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13857 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013858 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013859 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013860 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013861 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013862 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013863 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013864 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013865 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013866#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013867 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013868 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013869#endif
13870
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872 {NULL, NULL}
13873};
13874
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013875static PyObject *
13876unicode_mod(PyObject *v, PyObject *w)
13877{
Brian Curtindfc80e32011-08-10 20:28:54 -050013878 if (!PyUnicode_Check(v))
13879 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013880 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013881}
13882
13883static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013884 0, /*nb_add*/
13885 0, /*nb_subtract*/
13886 0, /*nb_multiply*/
13887 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013888};
13889
Guido van Rossumd57fd912000-03-10 22:53:23 +000013890static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 (lenfunc) unicode_length, /* sq_length */
13892 PyUnicode_Concat, /* sq_concat */
13893 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13894 (ssizeargfunc) unicode_getitem, /* sq_item */
13895 0, /* sq_slice */
13896 0, /* sq_ass_item */
13897 0, /* sq_ass_slice */
13898 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013899};
13900
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013901static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013902unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013904 if (PyUnicode_READY(self) == -1)
13905 return NULL;
13906
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013907 if (PyIndex_Check(item)) {
13908 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013909 if (i == -1 && PyErr_Occurred())
13910 return NULL;
13911 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013912 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013914 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013915 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013916 PyObject *result;
13917 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013918 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013919 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013921 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013923 return NULL;
13924 }
13925
13926 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013927 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013928 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013929 slicelength == PyUnicode_GET_LENGTH(self)) {
13930 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013931 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013932 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013933 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013934 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013935 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013936 src_kind = PyUnicode_KIND(self);
13937 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013938 if (!PyUnicode_IS_ASCII(self)) {
13939 kind_limit = kind_maxchar_limit(src_kind);
13940 max_char = 0;
13941 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13942 ch = PyUnicode_READ(src_kind, src_data, cur);
13943 if (ch > max_char) {
13944 max_char = ch;
13945 if (max_char >= kind_limit)
13946 break;
13947 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013948 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013949 }
Victor Stinner55c99112011-10-13 01:17:06 +020013950 else
13951 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013952 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013953 if (result == NULL)
13954 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013955 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013956 dest_data = PyUnicode_DATA(result);
13957
13958 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013959 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13960 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013961 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013962 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013963 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013964 } else {
13965 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13966 return NULL;
13967 }
13968}
13969
13970static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 (lenfunc)unicode_length, /* mp_length */
13972 (binaryfunc)unicode_subscript, /* mp_subscript */
13973 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013974};
13975
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976
Guido van Rossumd57fd912000-03-10 22:53:23 +000013977/* Helpers for PyUnicode_Format() */
13978
Victor Stinnera47082312012-10-04 02:19:54 +020013979struct unicode_formatter_t {
13980 PyObject *args;
13981 int args_owned;
13982 Py_ssize_t arglen, argidx;
13983 PyObject *dict;
13984
13985 enum PyUnicode_Kind fmtkind;
13986 Py_ssize_t fmtcnt, fmtpos;
13987 void *fmtdata;
13988 PyObject *fmtstr;
13989
13990 _PyUnicodeWriter writer;
13991};
13992
13993struct unicode_format_arg_t {
13994 Py_UCS4 ch;
13995 int flags;
13996 Py_ssize_t width;
13997 int prec;
13998 int sign;
13999};
14000
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014002unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014003{
Victor Stinnera47082312012-10-04 02:19:54 +020014004 Py_ssize_t argidx = ctx->argidx;
14005
14006 if (argidx < ctx->arglen) {
14007 ctx->argidx++;
14008 if (ctx->arglen < 0)
14009 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014010 else
Victor Stinnera47082312012-10-04 02:19:54 +020014011 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014012 }
14013 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014014 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014015 return NULL;
14016}
14017
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014018/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014019
Victor Stinnera47082312012-10-04 02:19:54 +020014020/* Format a float into the writer if the writer is not NULL, or into *p_output
14021 otherwise.
14022
14023 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014024static int
Victor Stinnera47082312012-10-04 02:19:54 +020014025formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14026 PyObject **p_output,
14027 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014028{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014029 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014030 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014031 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014032 int prec;
14033 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014034
Guido van Rossumd57fd912000-03-10 22:53:23 +000014035 x = PyFloat_AsDouble(v);
14036 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014037 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014038
Victor Stinnera47082312012-10-04 02:19:54 +020014039 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014040 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014041 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014042
Victor Stinnera47082312012-10-04 02:19:54 +020014043 if (arg->flags & F_ALT)
14044 dtoa_flags = Py_DTSF_ALT;
14045 else
14046 dtoa_flags = 0;
14047 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014048 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014049 return -1;
14050 len = strlen(p);
14051 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014052 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014053 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014054 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014055 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014056 }
14057 else
14058 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014059 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014060 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014061}
14062
Victor Stinnerd0880d52012-04-27 23:40:13 +020014063/* formatlong() emulates the format codes d, u, o, x and X, and
14064 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14065 * Python's regular ints.
14066 * Return value: a new PyUnicodeObject*, or NULL if error.
14067 * The output string is of the form
14068 * "-"? ("0x" | "0X")? digit+
14069 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14070 * set in flags. The case of hex digits will be correct,
14071 * There will be at least prec digits, zero-filled on the left if
14072 * necessary to get that many.
14073 * val object to be converted
14074 * flags bitmask of format flags; only F_ALT is looked at
14075 * prec minimum number of digits; 0-fill on left if needed
14076 * type a character in [duoxX]; u acts the same as d
14077 *
14078 * CAUTION: o, x and X conversions on regular ints can never
14079 * produce a '-' sign, but can for Python's unbounded ints.
14080 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014081PyObject *
14082_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014083{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014084 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014085 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014086 Py_ssize_t i;
14087 int sign; /* 1 if '-', else 0 */
14088 int len; /* number of characters */
14089 Py_ssize_t llen;
14090 int numdigits; /* len == numnondigits + numdigits */
14091 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014092
Victor Stinnerd0880d52012-04-27 23:40:13 +020014093 /* Avoid exceeding SSIZE_T_MAX */
14094 if (prec > INT_MAX-3) {
14095 PyErr_SetString(PyExc_OverflowError,
14096 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014097 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014098 }
14099
14100 assert(PyLong_Check(val));
14101
14102 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014103 default:
14104 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014105 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014106 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014107 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014108 /* int and int subclasses should print numerically when a numeric */
14109 /* format code is used (see issue18780) */
14110 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014111 break;
14112 case 'o':
14113 numnondigits = 2;
14114 result = PyNumber_ToBase(val, 8);
14115 break;
14116 case 'x':
14117 case 'X':
14118 numnondigits = 2;
14119 result = PyNumber_ToBase(val, 16);
14120 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014121 }
14122 if (!result)
14123 return NULL;
14124
14125 assert(unicode_modifiable(result));
14126 assert(PyUnicode_IS_READY(result));
14127 assert(PyUnicode_IS_ASCII(result));
14128
14129 /* To modify the string in-place, there can only be one reference. */
14130 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014131 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014132 PyErr_BadInternalCall();
14133 return NULL;
14134 }
14135 buf = PyUnicode_DATA(result);
14136 llen = PyUnicode_GET_LENGTH(result);
14137 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014138 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014139 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014140 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014141 return NULL;
14142 }
14143 len = (int)llen;
14144 sign = buf[0] == '-';
14145 numnondigits += sign;
14146 numdigits = len - numnondigits;
14147 assert(numdigits > 0);
14148
14149 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014150 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014151 (type == 'o' || type == 'x' || type == 'X'))) {
14152 assert(buf[sign] == '0');
14153 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14154 buf[sign+1] == 'o');
14155 numnondigits -= 2;
14156 buf += 2;
14157 len -= 2;
14158 if (sign)
14159 buf[0] = '-';
14160 assert(len == numnondigits + numdigits);
14161 assert(numdigits > 0);
14162 }
14163
14164 /* Fill with leading zeroes to meet minimum width. */
14165 if (prec > numdigits) {
14166 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14167 numnondigits + prec);
14168 char *b1;
14169 if (!r1) {
14170 Py_DECREF(result);
14171 return NULL;
14172 }
14173 b1 = PyBytes_AS_STRING(r1);
14174 for (i = 0; i < numnondigits; ++i)
14175 *b1++ = *buf++;
14176 for (i = 0; i < prec - numdigits; i++)
14177 *b1++ = '0';
14178 for (i = 0; i < numdigits; i++)
14179 *b1++ = *buf++;
14180 *b1 = '\0';
14181 Py_DECREF(result);
14182 result = r1;
14183 buf = PyBytes_AS_STRING(result);
14184 len = numnondigits + prec;
14185 }
14186
14187 /* Fix up case for hex conversions. */
14188 if (type == 'X') {
14189 /* Need to convert all lower case letters to upper case.
14190 and need to convert 0x to 0X (and -0x to -0X). */
14191 for (i = 0; i < len; i++)
14192 if (buf[i] >= 'a' && buf[i] <= 'x')
14193 buf[i] -= 'a'-'A';
14194 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014195 if (!PyUnicode_Check(result)
14196 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014197 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014198 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014199 Py_DECREF(result);
14200 result = unicode;
14201 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014202 else if (len != PyUnicode_GET_LENGTH(result)) {
14203 if (PyUnicode_Resize(&result, len) < 0)
14204 Py_CLEAR(result);
14205 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014206 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014207}
14208
Ethan Furmandf3ed242014-01-05 06:50:30 -080014209/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014210 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014211 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014212 * -1 and raise an exception on error */
14213static int
Victor Stinnera47082312012-10-04 02:19:54 +020014214mainformatlong(PyObject *v,
14215 struct unicode_format_arg_t *arg,
14216 PyObject **p_output,
14217 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014218{
14219 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014220 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014221
14222 if (!PyNumber_Check(v))
14223 goto wrongtype;
14224
Ethan Furman9ab74802014-03-21 06:38:46 -070014225 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014226 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014227 if (type == 'o' || type == 'x' || type == 'X') {
14228 iobj = PyNumber_Index(v);
14229 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014230 if (PyErr_ExceptionMatches(PyExc_TypeError))
14231 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014232 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014233 }
14234 }
14235 else {
14236 iobj = PyNumber_Long(v);
14237 if (iobj == NULL ) {
14238 if (PyErr_ExceptionMatches(PyExc_TypeError))
14239 goto wrongtype;
14240 return -1;
14241 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014242 }
14243 assert(PyLong_Check(iobj));
14244 }
14245 else {
14246 iobj = v;
14247 Py_INCREF(iobj);
14248 }
14249
14250 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014251 && arg->width == -1 && arg->prec == -1
14252 && !(arg->flags & (F_SIGN | F_BLANK))
14253 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014254 {
14255 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014256 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014257 int base;
14258
Victor Stinnera47082312012-10-04 02:19:54 +020014259 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014260 {
14261 default:
14262 assert(0 && "'type' not in [diuoxX]");
14263 case 'd':
14264 case 'i':
14265 case 'u':
14266 base = 10;
14267 break;
14268 case 'o':
14269 base = 8;
14270 break;
14271 case 'x':
14272 case 'X':
14273 base = 16;
14274 break;
14275 }
14276
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014277 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14278 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014279 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014280 }
14281 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014282 return 1;
14283 }
14284
Ethan Furmanb95b5612015-01-23 20:05:18 -080014285 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014286 Py_DECREF(iobj);
14287 if (res == NULL)
14288 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014289 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014290 return 0;
14291
14292wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014293 switch(type)
14294 {
14295 case 'o':
14296 case 'x':
14297 case 'X':
14298 PyErr_Format(PyExc_TypeError,
14299 "%%%c format: an integer is required, "
14300 "not %.200s",
14301 type, Py_TYPE(v)->tp_name);
14302 break;
14303 default:
14304 PyErr_Format(PyExc_TypeError,
14305 "%%%c format: a number is required, "
14306 "not %.200s",
14307 type, Py_TYPE(v)->tp_name);
14308 break;
14309 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310 return -1;
14311}
14312
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014313static Py_UCS4
14314formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014315{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014316 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014317 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014318 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014319 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014320 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014321 goto onError;
14322 }
14323 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014324 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014325 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014326 /* make sure number is a type of integer */
14327 if (!PyLong_Check(v)) {
14328 iobj = PyNumber_Index(v);
14329 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014330 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014331 }
14332 v = iobj;
14333 Py_DECREF(iobj);
14334 }
14335 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014336 x = PyLong_AsLong(v);
14337 if (x == -1 && PyErr_Occurred())
14338 goto onError;
14339
Victor Stinner8faf8212011-12-08 22:14:11 +010014340 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014341 PyErr_SetString(PyExc_OverflowError,
14342 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014343 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014344 }
14345
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014346 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014347 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014348
Benjamin Peterson29060642009-01-31 22:14:21 +000014349 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014350 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014351 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014352 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014353}
14354
Victor Stinnera47082312012-10-04 02:19:54 +020014355/* Parse options of an argument: flags, width, precision.
14356 Handle also "%(name)" syntax.
14357
14358 Return 0 if the argument has been formatted into arg->str.
14359 Return 1 if the argument has been written into ctx->writer,
14360 Raise an exception and return -1 on error. */
14361static int
14362unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14363 struct unicode_format_arg_t *arg)
14364{
14365#define FORMAT_READ(ctx) \
14366 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14367
14368 PyObject *v;
14369
Victor Stinnera47082312012-10-04 02:19:54 +020014370 if (arg->ch == '(') {
14371 /* Get argument value from a dictionary. Example: "%(name)s". */
14372 Py_ssize_t keystart;
14373 Py_ssize_t keylen;
14374 PyObject *key;
14375 int pcount = 1;
14376
14377 if (ctx->dict == NULL) {
14378 PyErr_SetString(PyExc_TypeError,
14379 "format requires a mapping");
14380 return -1;
14381 }
14382 ++ctx->fmtpos;
14383 --ctx->fmtcnt;
14384 keystart = ctx->fmtpos;
14385 /* Skip over balanced parentheses */
14386 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14387 arg->ch = FORMAT_READ(ctx);
14388 if (arg->ch == ')')
14389 --pcount;
14390 else if (arg->ch == '(')
14391 ++pcount;
14392 ctx->fmtpos++;
14393 }
14394 keylen = ctx->fmtpos - keystart - 1;
14395 if (ctx->fmtcnt < 0 || pcount > 0) {
14396 PyErr_SetString(PyExc_ValueError,
14397 "incomplete format key");
14398 return -1;
14399 }
14400 key = PyUnicode_Substring(ctx->fmtstr,
14401 keystart, keystart + keylen);
14402 if (key == NULL)
14403 return -1;
14404 if (ctx->args_owned) {
14405 Py_DECREF(ctx->args);
14406 ctx->args_owned = 0;
14407 }
14408 ctx->args = PyObject_GetItem(ctx->dict, key);
14409 Py_DECREF(key);
14410 if (ctx->args == NULL)
14411 return -1;
14412 ctx->args_owned = 1;
14413 ctx->arglen = -1;
14414 ctx->argidx = -2;
14415 }
14416
14417 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014418 while (--ctx->fmtcnt >= 0) {
14419 arg->ch = FORMAT_READ(ctx);
14420 ctx->fmtpos++;
14421 switch (arg->ch) {
14422 case '-': arg->flags |= F_LJUST; continue;
14423 case '+': arg->flags |= F_SIGN; continue;
14424 case ' ': arg->flags |= F_BLANK; continue;
14425 case '#': arg->flags |= F_ALT; continue;
14426 case '0': arg->flags |= F_ZERO; continue;
14427 }
14428 break;
14429 }
14430
14431 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014432 if (arg->ch == '*') {
14433 v = unicode_format_getnextarg(ctx);
14434 if (v == NULL)
14435 return -1;
14436 if (!PyLong_Check(v)) {
14437 PyErr_SetString(PyExc_TypeError,
14438 "* wants int");
14439 return -1;
14440 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014441 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014442 if (arg->width == -1 && PyErr_Occurred())
14443 return -1;
14444 if (arg->width < 0) {
14445 arg->flags |= F_LJUST;
14446 arg->width = -arg->width;
14447 }
14448 if (--ctx->fmtcnt >= 0) {
14449 arg->ch = FORMAT_READ(ctx);
14450 ctx->fmtpos++;
14451 }
14452 }
14453 else if (arg->ch >= '0' && arg->ch <= '9') {
14454 arg->width = arg->ch - '0';
14455 while (--ctx->fmtcnt >= 0) {
14456 arg->ch = FORMAT_READ(ctx);
14457 ctx->fmtpos++;
14458 if (arg->ch < '0' || arg->ch > '9')
14459 break;
14460 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14461 mixing signed and unsigned comparison. Since arg->ch is between
14462 '0' and '9', casting to int is safe. */
14463 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14464 PyErr_SetString(PyExc_ValueError,
14465 "width too big");
14466 return -1;
14467 }
14468 arg->width = arg->width*10 + (arg->ch - '0');
14469 }
14470 }
14471
14472 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014473 if (arg->ch == '.') {
14474 arg->prec = 0;
14475 if (--ctx->fmtcnt >= 0) {
14476 arg->ch = FORMAT_READ(ctx);
14477 ctx->fmtpos++;
14478 }
14479 if (arg->ch == '*') {
14480 v = unicode_format_getnextarg(ctx);
14481 if (v == NULL)
14482 return -1;
14483 if (!PyLong_Check(v)) {
14484 PyErr_SetString(PyExc_TypeError,
14485 "* wants int");
14486 return -1;
14487 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014488 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014489 if (arg->prec == -1 && PyErr_Occurred())
14490 return -1;
14491 if (arg->prec < 0)
14492 arg->prec = 0;
14493 if (--ctx->fmtcnt >= 0) {
14494 arg->ch = FORMAT_READ(ctx);
14495 ctx->fmtpos++;
14496 }
14497 }
14498 else if (arg->ch >= '0' && arg->ch <= '9') {
14499 arg->prec = arg->ch - '0';
14500 while (--ctx->fmtcnt >= 0) {
14501 arg->ch = FORMAT_READ(ctx);
14502 ctx->fmtpos++;
14503 if (arg->ch < '0' || arg->ch > '9')
14504 break;
14505 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14506 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014507 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014508 return -1;
14509 }
14510 arg->prec = arg->prec*10 + (arg->ch - '0');
14511 }
14512 }
14513 }
14514
14515 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14516 if (ctx->fmtcnt >= 0) {
14517 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14518 if (--ctx->fmtcnt >= 0) {
14519 arg->ch = FORMAT_READ(ctx);
14520 ctx->fmtpos++;
14521 }
14522 }
14523 }
14524 if (ctx->fmtcnt < 0) {
14525 PyErr_SetString(PyExc_ValueError,
14526 "incomplete format");
14527 return -1;
14528 }
14529 return 0;
14530
14531#undef FORMAT_READ
14532}
14533
14534/* Format one argument. Supported conversion specifiers:
14535
14536 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014537 - "i", "d", "u": int or float
14538 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014539 - "e", "E", "f", "F", "g", "G": float
14540 - "c": int or str (1 character)
14541
Victor Stinner8dbd4212012-12-04 09:30:24 +010014542 When possible, the output is written directly into the Unicode writer
14543 (ctx->writer). A string is created when padding is required.
14544
Victor Stinnera47082312012-10-04 02:19:54 +020014545 Return 0 if the argument has been formatted into *p_str,
14546 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014547 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014548static int
14549unicode_format_arg_format(struct unicode_formatter_t *ctx,
14550 struct unicode_format_arg_t *arg,
14551 PyObject **p_str)
14552{
14553 PyObject *v;
14554 _PyUnicodeWriter *writer = &ctx->writer;
14555
14556 if (ctx->fmtcnt == 0)
14557 ctx->writer.overallocate = 0;
14558
14559 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014560 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014561 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014562 return 1;
14563 }
14564
14565 v = unicode_format_getnextarg(ctx);
14566 if (v == NULL)
14567 return -1;
14568
Victor Stinnera47082312012-10-04 02:19:54 +020014569
14570 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014571 case 's':
14572 case 'r':
14573 case 'a':
14574 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14575 /* Fast path */
14576 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14577 return -1;
14578 return 1;
14579 }
14580
14581 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14582 *p_str = v;
14583 Py_INCREF(*p_str);
14584 }
14585 else {
14586 if (arg->ch == 's')
14587 *p_str = PyObject_Str(v);
14588 else if (arg->ch == 'r')
14589 *p_str = PyObject_Repr(v);
14590 else
14591 *p_str = PyObject_ASCII(v);
14592 }
14593 break;
14594
14595 case 'i':
14596 case 'd':
14597 case 'u':
14598 case 'o':
14599 case 'x':
14600 case 'X':
14601 {
14602 int ret = mainformatlong(v, arg, p_str, writer);
14603 if (ret != 0)
14604 return ret;
14605 arg->sign = 1;
14606 break;
14607 }
14608
14609 case 'e':
14610 case 'E':
14611 case 'f':
14612 case 'F':
14613 case 'g':
14614 case 'G':
14615 if (arg->width == -1 && arg->prec == -1
14616 && !(arg->flags & (F_SIGN | F_BLANK)))
14617 {
14618 /* Fast path */
14619 if (formatfloat(v, arg, NULL, writer) == -1)
14620 return -1;
14621 return 1;
14622 }
14623
14624 arg->sign = 1;
14625 if (formatfloat(v, arg, p_str, NULL) == -1)
14626 return -1;
14627 break;
14628
14629 case 'c':
14630 {
14631 Py_UCS4 ch = formatchar(v);
14632 if (ch == (Py_UCS4) -1)
14633 return -1;
14634 if (arg->width == -1 && arg->prec == -1) {
14635 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014636 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014637 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014638 return 1;
14639 }
14640 *p_str = PyUnicode_FromOrdinal(ch);
14641 break;
14642 }
14643
14644 default:
14645 PyErr_Format(PyExc_ValueError,
14646 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014647 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014648 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14649 (int)arg->ch,
14650 ctx->fmtpos - 1);
14651 return -1;
14652 }
14653 if (*p_str == NULL)
14654 return -1;
14655 assert (PyUnicode_Check(*p_str));
14656 return 0;
14657}
14658
14659static int
14660unicode_format_arg_output(struct unicode_formatter_t *ctx,
14661 struct unicode_format_arg_t *arg,
14662 PyObject *str)
14663{
14664 Py_ssize_t len;
14665 enum PyUnicode_Kind kind;
14666 void *pbuf;
14667 Py_ssize_t pindex;
14668 Py_UCS4 signchar;
14669 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014670 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014671 Py_ssize_t sublen;
14672 _PyUnicodeWriter *writer = &ctx->writer;
14673 Py_UCS4 fill;
14674
14675 fill = ' ';
14676 if (arg->sign && arg->flags & F_ZERO)
14677 fill = '0';
14678
14679 if (PyUnicode_READY(str) == -1)
14680 return -1;
14681
14682 len = PyUnicode_GET_LENGTH(str);
14683 if ((arg->width == -1 || arg->width <= len)
14684 && (arg->prec == -1 || arg->prec >= len)
14685 && !(arg->flags & (F_SIGN | F_BLANK)))
14686 {
14687 /* Fast path */
14688 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14689 return -1;
14690 return 0;
14691 }
14692
14693 /* Truncate the string for "s", "r" and "a" formats
14694 if the precision is set */
14695 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14696 if (arg->prec >= 0 && len > arg->prec)
14697 len = arg->prec;
14698 }
14699
14700 /* Adjust sign and width */
14701 kind = PyUnicode_KIND(str);
14702 pbuf = PyUnicode_DATA(str);
14703 pindex = 0;
14704 signchar = '\0';
14705 if (arg->sign) {
14706 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14707 if (ch == '-' || ch == '+') {
14708 signchar = ch;
14709 len--;
14710 pindex++;
14711 }
14712 else if (arg->flags & F_SIGN)
14713 signchar = '+';
14714 else if (arg->flags & F_BLANK)
14715 signchar = ' ';
14716 else
14717 arg->sign = 0;
14718 }
14719 if (arg->width < len)
14720 arg->width = len;
14721
14722 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014723 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014724 if (!(arg->flags & F_LJUST)) {
14725 if (arg->sign) {
14726 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014727 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014728 }
14729 else {
14730 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014731 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014732 }
14733 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014734 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14735 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014736 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014737 }
14738
Victor Stinnera47082312012-10-04 02:19:54 +020014739 buflen = arg->width;
14740 if (arg->sign && len == arg->width)
14741 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014742 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014743 return -1;
14744
14745 /* Write the sign if needed */
14746 if (arg->sign) {
14747 if (fill != ' ') {
14748 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14749 writer->pos += 1;
14750 }
14751 if (arg->width > len)
14752 arg->width--;
14753 }
14754
14755 /* Write the numeric prefix for "x", "X" and "o" formats
14756 if the alternate form is used.
14757 For example, write "0x" for the "%#x" format. */
14758 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14759 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14760 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14761 if (fill != ' ') {
14762 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14763 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14764 writer->pos += 2;
14765 pindex += 2;
14766 }
14767 arg->width -= 2;
14768 if (arg->width < 0)
14769 arg->width = 0;
14770 len -= 2;
14771 }
14772
14773 /* Pad left with the fill character if needed */
14774 if (arg->width > len && !(arg->flags & F_LJUST)) {
14775 sublen = arg->width - len;
14776 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14777 writer->pos += sublen;
14778 arg->width = len;
14779 }
14780
14781 /* If padding with spaces: write sign if needed and/or numeric prefix if
14782 the alternate form is used */
14783 if (fill == ' ') {
14784 if (arg->sign) {
14785 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14786 writer->pos += 1;
14787 }
14788 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14789 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14790 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14791 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14792 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14793 writer->pos += 2;
14794 pindex += 2;
14795 }
14796 }
14797
14798 /* Write characters */
14799 if (len) {
14800 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14801 str, pindex, len);
14802 writer->pos += len;
14803 }
14804
14805 /* Pad right with the fill character if needed */
14806 if (arg->width > len) {
14807 sublen = arg->width - len;
14808 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14809 writer->pos += sublen;
14810 }
14811 return 0;
14812}
14813
14814/* Helper of PyUnicode_Format(): format one arg.
14815 Return 0 on success, raise an exception and return -1 on error. */
14816static int
14817unicode_format_arg(struct unicode_formatter_t *ctx)
14818{
14819 struct unicode_format_arg_t arg;
14820 PyObject *str;
14821 int ret;
14822
Victor Stinner8dbd4212012-12-04 09:30:24 +010014823 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14824 arg.flags = 0;
14825 arg.width = -1;
14826 arg.prec = -1;
14827 arg.sign = 0;
14828 str = NULL;
14829
Victor Stinnera47082312012-10-04 02:19:54 +020014830 ret = unicode_format_arg_parse(ctx, &arg);
14831 if (ret == -1)
14832 return -1;
14833
14834 ret = unicode_format_arg_format(ctx, &arg, &str);
14835 if (ret == -1)
14836 return -1;
14837
14838 if (ret != 1) {
14839 ret = unicode_format_arg_output(ctx, &arg, str);
14840 Py_DECREF(str);
14841 if (ret == -1)
14842 return -1;
14843 }
14844
14845 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14846 PyErr_SetString(PyExc_TypeError,
14847 "not all arguments converted during string formatting");
14848 return -1;
14849 }
14850 return 0;
14851}
14852
Alexander Belopolsky40018472011-02-26 01:02:56 +000014853PyObject *
14854PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014855{
Victor Stinnera47082312012-10-04 02:19:54 +020014856 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014857
Guido van Rossumd57fd912000-03-10 22:53:23 +000014858 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014859 PyErr_BadInternalCall();
14860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014861 }
Victor Stinnera47082312012-10-04 02:19:54 +020014862
14863 ctx.fmtstr = PyUnicode_FromObject(format);
14864 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014865 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014866 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14867 Py_DECREF(ctx.fmtstr);
14868 return NULL;
14869 }
14870 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14871 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14872 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14873 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014874
Victor Stinner8f674cc2013-04-17 23:02:17 +020014875 _PyUnicodeWriter_Init(&ctx.writer);
14876 ctx.writer.min_length = ctx.fmtcnt + 100;
14877 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014878
Guido van Rossumd57fd912000-03-10 22:53:23 +000014879 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014880 ctx.arglen = PyTuple_Size(args);
14881 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014882 }
14883 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014884 ctx.arglen = -1;
14885 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014886 }
Victor Stinnera47082312012-10-04 02:19:54 +020014887 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014888 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014889 ctx.dict = args;
14890 else
14891 ctx.dict = NULL;
14892 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014893
Victor Stinnera47082312012-10-04 02:19:54 +020014894 while (--ctx.fmtcnt >= 0) {
14895 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014896 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014897
14898 nonfmtpos = ctx.fmtpos++;
14899 while (ctx.fmtcnt >= 0 &&
14900 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14901 ctx.fmtpos++;
14902 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014903 }
Victor Stinnera47082312012-10-04 02:19:54 +020014904 if (ctx.fmtcnt < 0) {
14905 ctx.fmtpos--;
14906 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014907 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014908
Victor Stinnercfc4c132013-04-03 01:48:39 +020014909 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14910 nonfmtpos, ctx.fmtpos) < 0)
14911 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014912 }
14913 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014914 ctx.fmtpos++;
14915 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014916 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014917 }
14918 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014919
Victor Stinnera47082312012-10-04 02:19:54 +020014920 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014921 PyErr_SetString(PyExc_TypeError,
14922 "not all arguments converted during string formatting");
14923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014924 }
14925
Victor Stinnera47082312012-10-04 02:19:54 +020014926 if (ctx.args_owned) {
14927 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014928 }
Victor Stinnera47082312012-10-04 02:19:54 +020014929 Py_DECREF(ctx.fmtstr);
14930 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014931
Benjamin Peterson29060642009-01-31 22:14:21 +000014932 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014933 Py_DECREF(ctx.fmtstr);
14934 _PyUnicodeWriter_Dealloc(&ctx.writer);
14935 if (ctx.args_owned) {
14936 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014937 }
14938 return NULL;
14939}
14940
Jeremy Hylton938ace62002-07-17 16:30:39 +000014941static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014942unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14943
Tim Peters6d6c1a32001-08-02 04:15:00 +000014944static PyObject *
14945unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14946{
Benjamin Peterson29060642009-01-31 22:14:21 +000014947 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014948 static char *kwlist[] = {"object", "encoding", "errors", 0};
14949 char *encoding = NULL;
14950 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014951
Benjamin Peterson14339b62009-01-31 16:36:08 +000014952 if (type != &PyUnicode_Type)
14953 return unicode_subtype_new(type, args, kwds);
14954 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014955 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014956 return NULL;
14957 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014958 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014959 if (encoding == NULL && errors == NULL)
14960 return PyObject_Str(x);
14961 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014962 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014963}
14964
Guido van Rossume023fe02001-08-30 03:12:59 +000014965static PyObject *
14966unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14967{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014968 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014969 Py_ssize_t length, char_size;
14970 int share_wstr, share_utf8;
14971 unsigned int kind;
14972 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014973
Benjamin Peterson14339b62009-01-31 16:36:08 +000014974 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014975
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014976 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014977 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014978 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014979 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014980 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014981 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014982 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014983 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014984
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014985 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014986 if (self == NULL) {
14987 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014988 return NULL;
14989 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014990 kind = PyUnicode_KIND(unicode);
14991 length = PyUnicode_GET_LENGTH(unicode);
14992
14993 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014994#ifdef Py_DEBUG
14995 _PyUnicode_HASH(self) = -1;
14996#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014997 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014998#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014999 _PyUnicode_STATE(self).interned = 0;
15000 _PyUnicode_STATE(self).kind = kind;
15001 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015002 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015003 _PyUnicode_STATE(self).ready = 1;
15004 _PyUnicode_WSTR(self) = NULL;
15005 _PyUnicode_UTF8_LENGTH(self) = 0;
15006 _PyUnicode_UTF8(self) = NULL;
15007 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015008 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015009
15010 share_utf8 = 0;
15011 share_wstr = 0;
15012 if (kind == PyUnicode_1BYTE_KIND) {
15013 char_size = 1;
15014 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15015 share_utf8 = 1;
15016 }
15017 else if (kind == PyUnicode_2BYTE_KIND) {
15018 char_size = 2;
15019 if (sizeof(wchar_t) == 2)
15020 share_wstr = 1;
15021 }
15022 else {
15023 assert(kind == PyUnicode_4BYTE_KIND);
15024 char_size = 4;
15025 if (sizeof(wchar_t) == 4)
15026 share_wstr = 1;
15027 }
15028
15029 /* Ensure we won't overflow the length. */
15030 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15031 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015032 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015034 data = PyObject_MALLOC((length + 1) * char_size);
15035 if (data == NULL) {
15036 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015037 goto onError;
15038 }
15039
Victor Stinnerc3c74152011-10-02 20:39:55 +020015040 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041 if (share_utf8) {
15042 _PyUnicode_UTF8_LENGTH(self) = length;
15043 _PyUnicode_UTF8(self) = data;
15044 }
15045 if (share_wstr) {
15046 _PyUnicode_WSTR_LENGTH(self) = length;
15047 _PyUnicode_WSTR(self) = (wchar_t *)data;
15048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015049
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015050 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015051 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015052 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015053#ifdef Py_DEBUG
15054 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15055#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015056 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015057 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058
15059onError:
15060 Py_DECREF(unicode);
15061 Py_DECREF(self);
15062 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015063}
15064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015065PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015066"str(object='') -> str\n\
15067str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015068\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015069Create a new string object from the given object. If encoding or\n\
15070errors is specified, then the object must expose a data buffer\n\
15071that will be decoded using the given encoding and error handler.\n\
15072Otherwise, returns the result of object.__str__() (if defined)\n\
15073or repr(object).\n\
15074encoding defaults to sys.getdefaultencoding().\n\
15075errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015076
/* Forward declaration: unicode_iter is referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.

   The initializer is positional: each value fills the slot named in the
   comment on its line.  The type can be subclassed (Py_TPFLAGS_BASETYPE),
   instances are created by unicode_new and their memory is released with
   PyObject_Del. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15122
15123/* Initialize the Unicode implementation */
15124
Victor Stinner3a50e702011-10-18 21:21:00 +020015125int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015126{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015127 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015129 0x000A, /* LINE FEED */
15130 0x000D, /* CARRIAGE RETURN */
15131 0x001C, /* FILE SEPARATOR */
15132 0x001D, /* GROUP SEPARATOR */
15133 0x001E, /* RECORD SEPARATOR */
15134 0x0085, /* NEXT LINE */
15135 0x2028, /* LINE SEPARATOR */
15136 0x2029, /* PARAGRAPH SEPARATOR */
15137 };
15138
Fred Drakee4315f52000-05-09 19:53:39 +000015139 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015140 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015141 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015143 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015144
Guido van Rossumcacfc072002-05-24 19:01:59 +000015145 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015146 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015147
15148 /* initialize the linebreak bloom filter */
15149 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015150 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015151 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015152
Christian Heimes26532f72013-07-20 14:57:16 +020015153 if (PyType_Ready(&EncodingMapType) < 0)
15154 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015155
Benjamin Petersonc4311282012-10-30 23:21:10 -040015156 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15157 Py_FatalError("Can't initialize field name iterator type");
15158
15159 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15160 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015161
Victor Stinner3a50e702011-10-18 21:21:00 +020015162 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015163}
15164
15165/* Finalize the Unicode implementation */
15166
/* Nothing is released by this function; it always returns 0. */
int PyUnicode_ClearFreeList(void)
{
    return 0;
}
15172
Guido van Rossumd57fd912000-03-10 22:53:23 +000015173void
Thomas Wouters78890102000-07-22 19:25:51 +000015174_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015175{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015176 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177
Serhiy Storchaka05997252013-01-26 12:14:02 +020015178 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015179
Serhiy Storchaka05997252013-01-26 12:14:02 +020015180 for (i = 0; i < 256; i++)
15181 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015182 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015183 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015184}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015185
Walter Dörwald16807132007-05-25 13:52:07 +000015186void
15187PyUnicode_InternInPlace(PyObject **p)
15188{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015189 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015190 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015191#ifdef Py_DEBUG
15192 assert(s != NULL);
15193 assert(_PyUnicode_CHECK(s));
15194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015196 return;
15197#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 /* If it's a subclass, we don't really know what putting
15199 it in the interned dict might do. */
15200 if (!PyUnicode_CheckExact(s))
15201 return;
15202 if (PyUnicode_CHECK_INTERNED(s))
15203 return;
15204 if (interned == NULL) {
15205 interned = PyDict_New();
15206 if (interned == NULL) {
15207 PyErr_Clear(); /* Don't leave an exception */
15208 return;
15209 }
15210 }
15211 /* It might be that the GetItem call fails even
15212 though the key is present in the dictionary,
15213 namely when this happens during a stack overflow. */
15214 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015215 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015217
Victor Stinnerf0335102013-04-14 19:13:03 +020015218 if (t) {
15219 Py_INCREF(t);
15220 Py_DECREF(*p);
15221 *p = t;
15222 return;
15223 }
Walter Dörwald16807132007-05-25 13:52:07 +000015224
Benjamin Peterson14339b62009-01-31 16:36:08 +000015225 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015226 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 PyErr_Clear();
15228 PyThreadState_GET()->recursion_critical = 0;
15229 return;
15230 }
15231 PyThreadState_GET()->recursion_critical = 0;
15232 /* The two references in interned are not counted by refcnt.
15233 The deallocator will take care of this */
15234 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015235 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015236}
15237
15238void
15239PyUnicode_InternImmortal(PyObject **p)
15240{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015241 PyUnicode_InternInPlace(p);
15242 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015243 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015244 Py_INCREF(*p);
15245 }
Walter Dörwald16807132007-05-25 13:52:07 +000015246}
15247
15248PyObject *
15249PyUnicode_InternFromString(const char *cp)
15250{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 PyObject *s = PyUnicode_FromString(cp);
15252 if (s == NULL)
15253 return NULL;
15254 PyUnicode_InternInPlace(&s);
15255 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015256}
15257
Alexander Belopolsky40018472011-02-26 01:02:56 +000015258void
15259_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015260{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015262 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 Py_ssize_t i, n;
15264 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015265
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 if (interned == NULL || !PyDict_Check(interned))
15267 return;
15268 keys = PyDict_Keys(interned);
15269 if (keys == NULL || !PyList_Check(keys)) {
15270 PyErr_Clear();
15271 return;
15272 }
Walter Dörwald16807132007-05-25 13:52:07 +000015273
Benjamin Peterson14339b62009-01-31 16:36:08 +000015274 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15275 detector, interned unicode strings are not forcibly deallocated;
15276 rather, we give them their stolen references back, and then clear
15277 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015278
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 n = PyList_GET_SIZE(keys);
15280 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015281 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015283 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015284 if (PyUnicode_READY(s) == -1) {
15285 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015286 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015288 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 case SSTATE_NOT_INTERNED:
15290 /* XXX Shouldn't happen */
15291 break;
15292 case SSTATE_INTERNED_IMMORTAL:
15293 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015294 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 break;
15296 case SSTATE_INTERNED_MORTAL:
15297 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015298 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 break;
15300 default:
15301 Py_FatalError("Inconsistent interned string state.");
15302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015303 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 }
15305 fprintf(stderr, "total size of all interned strings: "
15306 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15307 "mortal/immortal\n", mortal_size, immortal_size);
15308 Py_DECREF(keys);
15309 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015310 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015311}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015312
15313
15314/********************* Unicode Iterator **************************/
15315
15316typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 PyObject_HEAD
15318 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015319 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015320} unicodeiterobject;
15321
15322static void
15323unicodeiter_dealloc(unicodeiterobject *it)
15324{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015325 _PyObject_GC_UNTRACK(it);
15326 Py_XDECREF(it->it_seq);
15327 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015328}
15329
15330static int
15331unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015333 Py_VISIT(it->it_seq);
15334 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015335}
15336
15337static PyObject *
15338unicodeiter_next(unicodeiterobject *it)
15339{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015340 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015341
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 assert(it != NULL);
15343 seq = it->it_seq;
15344 if (seq == NULL)
15345 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015346 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015348 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15349 int kind = PyUnicode_KIND(seq);
15350 void *data = PyUnicode_DATA(seq);
15351 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15352 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 if (item != NULL)
15354 ++it->it_index;
15355 return item;
15356 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015357
Benjamin Peterson14339b62009-01-31 16:36:08 +000015358 Py_DECREF(seq);
15359 it->it_seq = NULL;
15360 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015361}
15362
15363static PyObject *
15364unicodeiter_len(unicodeiterobject *it)
15365{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 Py_ssize_t len = 0;
15367 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015368 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015370}
15371
15372PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15373
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015374static PyObject *
15375unicodeiter_reduce(unicodeiterobject *it)
15376{
15377 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015378 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015379 it->it_seq, it->it_index);
15380 } else {
15381 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15382 if (u == NULL)
15383 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015384 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015385 }
15386}
15387
15388PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15389
15390static PyObject *
15391unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15392{
15393 Py_ssize_t index = PyLong_AsSsize_t(state);
15394 if (index == -1 && PyErr_Occurred())
15395 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015396 if (it->it_seq != NULL) {
15397 if (index < 0)
15398 index = 0;
15399 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15400 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15401 it->it_index = index;
15402 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015403 Py_RETURN_NONE;
15404}
15405
15406PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15407
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015408static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015410 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015411 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15412 reduce_doc},
15413 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15414 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015416};
15417
15418PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015419 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15420 "str_iterator", /* tp_name */
15421 sizeof(unicodeiterobject), /* tp_basicsize */
15422 0, /* tp_itemsize */
15423 /* methods */
15424 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15425 0, /* tp_print */
15426 0, /* tp_getattr */
15427 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015428 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015429 0, /* tp_repr */
15430 0, /* tp_as_number */
15431 0, /* tp_as_sequence */
15432 0, /* tp_as_mapping */
15433 0, /* tp_hash */
15434 0, /* tp_call */
15435 0, /* tp_str */
15436 PyObject_GenericGetAttr, /* tp_getattro */
15437 0, /* tp_setattro */
15438 0, /* tp_as_buffer */
15439 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15440 0, /* tp_doc */
15441 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15442 0, /* tp_clear */
15443 0, /* tp_richcompare */
15444 0, /* tp_weaklistoffset */
15445 PyObject_SelfIter, /* tp_iter */
15446 (iternextfunc)unicodeiter_next, /* tp_iternext */
15447 unicodeiter_methods, /* tp_methods */
15448 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015449};
15450
15451static PyObject *
15452unicode_iter(PyObject *seq)
15453{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015454 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015455
Benjamin Peterson14339b62009-01-31 16:36:08 +000015456 if (!PyUnicode_Check(seq)) {
15457 PyErr_BadInternalCall();
15458 return NULL;
15459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015460 if (PyUnicode_READY(seq) == -1)
15461 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15463 if (it == NULL)
15464 return NULL;
15465 it->it_index = 0;
15466 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015467 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 _PyObject_GC_TRACK(it);
15469 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015470}
15471
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015472
15473size_t
15474Py_UNICODE_strlen(const Py_UNICODE *u)
15475{
15476 int res = 0;
15477 while(*u++)
15478 res++;
15479 return res;
15480}
15481
15482Py_UNICODE*
15483Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15484{
15485 Py_UNICODE *u = s1;
15486 while ((*u++ = *s2++));
15487 return s1;
15488}
15489
15490Py_UNICODE*
15491Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15492{
15493 Py_UNICODE *u = s1;
15494 while ((*u++ = *s2++))
15495 if (n-- == 0)
15496 break;
15497 return s1;
15498}
15499
15500Py_UNICODE*
15501Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15502{
15503 Py_UNICODE *u1 = s1;
15504 u1 += Py_UNICODE_strlen(u1);
15505 Py_UNICODE_strcpy(u1, s2);
15506 return s1;
15507}
15508
15509int
15510Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15511{
15512 while (*s1 && *s2 && *s1 == *s2)
15513 s1++, s2++;
15514 if (*s1 && *s2)
15515 return (*s1 < *s2) ? -1 : +1;
15516 if (*s1)
15517 return 1;
15518 if (*s2)
15519 return -1;
15520 return 0;
15521}
15522
15523int
15524Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15525{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015526 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015527 for (; n != 0; n--) {
15528 u1 = *s1;
15529 u2 = *s2;
15530 if (u1 != u2)
15531 return (u1 < u2) ? -1 : +1;
15532 if (u1 == '\0')
15533 return 0;
15534 s1++;
15535 s2++;
15536 }
15537 return 0;
15538}
15539
15540Py_UNICODE*
15541Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15542{
15543 const Py_UNICODE *p;
15544 for (p = s; *p; p++)
15545 if (*p == c)
15546 return (Py_UNICODE*)p;
15547 return NULL;
15548}
15549
15550Py_UNICODE*
15551Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15552{
15553 const Py_UNICODE *p;
15554 p = s + Py_UNICODE_strlen(s);
15555 while (p != s) {
15556 p--;
15557 if (*p == c)
15558 return (Py_UNICODE*)p;
15559 }
15560 return NULL;
15561}
Victor Stinner331ea922010-08-10 16:37:20 +000015562
Victor Stinner71133ff2010-09-01 23:43:53 +000015563Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015564PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015565{
Victor Stinner577db2c2011-10-11 22:12:48 +020015566 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015567 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015569 if (!PyUnicode_Check(unicode)) {
15570 PyErr_BadArgument();
15571 return NULL;
15572 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015573 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015574 if (u == NULL)
15575 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015576 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015577 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015578 PyErr_NoMemory();
15579 return NULL;
15580 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015581 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015582 size *= sizeof(Py_UNICODE);
15583 copy = PyMem_Malloc(size);
15584 if (copy == NULL) {
15585 PyErr_NoMemory();
15586 return NULL;
15587 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015588 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015589 return copy;
15590}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015591
Georg Brandl66c221e2010-10-14 07:04:07 +000015592/* A _string module, to export formatter_parser and formatter_field_name_split
15593 to the string.Formatter class implemented in Python. */
15594
15595static PyMethodDef _string_methods[] = {
15596 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15597 METH_O, PyDoc_STR("split the argument as a field name")},
15598 {"formatter_parser", (PyCFunction) formatter_parser,
15599 METH_O, PyDoc_STR("parse the argument as a format string")},
15600 {NULL, NULL}
15601};
15602
15603static struct PyModuleDef _string_module = {
15604 PyModuleDef_HEAD_INIT,
15605 "_string",
15606 PyDoc_STR("string helper module"),
15607 0,
15608 _string_methods,
15609 NULL,
15610 NULL,
15611 NULL,
15612 NULL
15613};
15614
15615PyMODINIT_FUNC
15616PyInit__string(void)
15617{
15618 return PyModule_Create(&_string_module);
15619}
15620
15621
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015622#ifdef __cplusplus
15623}
15624#endif