blob: b96333ce472622816a7cb4764e118673444af2bc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file.  Debug builds run
   the structural consistency checker with check_content=0 (the character
   data is not scanned); other builds only check the object type. */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.  The macros that expand to a bare struct member can be
   used as lvalues and perform no checks; the ones that start with assert()
   verify the object first and yield a value only. */

/* utf8 member of the PyCompactUnicodeObject view of op (unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory immediately following the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the PyCompactUnicodeObject view of op (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 buffer of a ready string: for a compact ASCII string
   this is the string length itself, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr member of the PyASCIIObject view of op (unchecked). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length member of the PyCompactUnicodeObject view of op (unchecked). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length member of the PyASCIIObject view of op (unchecked). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* state member of the PyASCIIObject view of op (unchecked). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* hash member of the PyASCIIObject view of op (unchecked). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind of op, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length of op, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the PyUnicodeObject view of op (unchecked). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Drop any earlier definition of PyUnicode_READY and use this one for the
   rest of the file.  After asserting _PyUnicode_CHECK(op), it evaluates to 0
   when the string is already ready and otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of op is the same address as its character data
   (PyUnicode_DATA).  Must not be used on compact ASCII strings; this is
   asserted. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its character data
   (PyUnicode_DATA).  The comparison uses the macro argument `op`; it
   previously referred to a variable named `unicode`, which silently bound
   to whatever was in the caller's scope instead of the argument. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the string is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   begin, end and to are each evaluated exactly once (they are copied into
   local variables).  Characters are converted with a plain cast to to_type,
   four per iteration for the bulk of the input and then one at a time for
   the remainder.  The macro expands to a single statement. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-specific overallocation factor.
   NOTE(review): the code that consumes this value is outside this part of
   the file; judging by the comments below the factor is used as a divisor
   (2 -> 50%, 4 -> 25%) -- confirm against the user. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% gives the best results */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% gives the best results */
#  define OVERALLOCATE_FACTOR 4
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string, creating it on first
   use.  When the singleton has to be created, the reference returned by
   PyUnicode_New() stays in unicode_empty and a second one is taken for the
   caller.  If PyUnicode_New() fails, unicode_empty remains NULL and no
   reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the singleton could not be
   created (see _Py_INCREF_UNICODE_EMPTY). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Forward declaration */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255); a NULL
   entry means no string has been cached for that character yet. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Lookup table with one entry per ASCII code point (128 entries, eight per
   row): 1 marks a whitespace character, 0 anything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
/* Forward declarations of file-local helpers that are used before their
   definitions appear. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors from a buffer of `size` UCS1 / UCS2 / UCS4 characters. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers (defined later in the file). */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks.
   Lookup table with one entry per ASCII code point (128 entries, eight per
   row): 1 marks a line break character, 0 anything else. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers that the encoders/decoders in this file can
   recognise by name.  _Py_ERROR_OTHER stands for any name that is not in
   the list below. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL, the matching enum value for one of the
   seven recognised names (exact, case-sensitive match), and _Py_ERROR_OTHER
   for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* A missing handler name means "strict". */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }

    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]);
         idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-only structural validator for a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of WCHAR kind), the
                  characters are scanned as well to verify that the
                  narrowest suitable kind is in use and that the buffer is
                  NUL-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1, so it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header is inspected. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: the character data directly follows the
               PyCompactUnicodeObject header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Not compact: the character data is reached through the
               data.any pointer. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* An ASCII string shares its utf8 buffer with the data;
                   a non-ASCII string must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t (selected at compile time below). */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing buffer must have a zero recorded length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point actually stored. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element one past the last character must be a NUL. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Alexander Belopolsky40018472011-02-26 01:02:56 +0000723Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200829Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200830 Py_ssize_t size, Py_UCS4 ch,
831 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS1) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
839 else
840 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if ((Py_UCS2) ch != ch)
843 return -1;
844 if (direction > 0)
845 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
846 else
847 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200848 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200849 if (direction > 0)
850 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
851 else
852 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200853 default:
854 assert(0);
855 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857}
858
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, with
   0xff bytes; nothing is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the number of bytes per character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
877
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878static PyObject*
879resize_compact(PyObject *unicode, Py_ssize_t length)
880{
881 Py_ssize_t char_size;
882 Py_ssize_t struct_size;
883 Py_ssize_t new_size;
884 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100885 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200886#ifdef Py_DEBUG
887 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
888#endif
889
Victor Stinner79891572012-05-03 13:43:07 +0200890 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200891 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100892 assert(PyUnicode_IS_COMPACT(unicode));
893
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200894 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100895 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896 struct_size = sizeof(PyASCIIObject);
897 else
898 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200899 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
902 PyErr_NoMemory();
903 return NULL;
904 }
905 new_size = (struct_size + (length + 1) * char_size);
906
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200907 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_UTF8(unicode));
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911 }
Victor Stinner84def372011-12-11 20:04:56 +0100912 _Py_DEC_REFTOTAL;
913 _Py_ForgetReference(unicode);
914
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300915 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100916 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 PyErr_NoMemory();
919 return NULL;
920 }
Victor Stinner84def372011-12-11 20:04:56 +0100921 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200922 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100923
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200925 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100927 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200928 _PyUnicode_WSTR_LENGTH(unicode) = length;
929 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100930 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
931 PyObject_DEL(_PyUnicode_WSTR(unicode));
932 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100933 if (!PyUnicode_IS_ASCII(unicode))
934 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100935 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200936#ifdef Py_DEBUG
937 unicode_fill_invalid(unicode, old_length);
938#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
940 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200941 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 return unicode;
943}
944
Alexander Belopolsky40018472011-02-26 01:02:56 +0000945static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200946resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947{
Victor Stinner95663112011-10-04 01:03:50 +0200948 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100949 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200951 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000952
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 if (PyUnicode_IS_READY(unicode)) {
954 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200955 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200957#ifdef Py_DEBUG
958 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
959#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960
961 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200962 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200963 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
964 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965
966 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
967 PyErr_NoMemory();
968 return -1;
969 }
970 new_size = (length + 1) * char_size;
971
Victor Stinner7a9105a2011-12-12 00:13:42 +0100972 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
973 {
974 PyObject_DEL(_PyUnicode_UTF8(unicode));
975 _PyUnicode_UTF8(unicode) = NULL;
976 _PyUnicode_UTF8_LENGTH(unicode) = 0;
977 }
978
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 data = (PyObject *)PyObject_REALLOC(data, new_size);
980 if (data == NULL) {
981 PyErr_NoMemory();
982 return -1;
983 }
984 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200985 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200987 _PyUnicode_WSTR_LENGTH(unicode) = length;
988 }
989 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200990 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200991 _PyUnicode_UTF8_LENGTH(unicode) = length;
992 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 _PyUnicode_LENGTH(unicode) = length;
994 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200995#ifdef Py_DEBUG
996 unicode_fill_invalid(unicode, old_length);
997#endif
Victor Stinner95663112011-10-04 01:03:50 +0200998 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200999 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinner95663112011-10-04 01:03:50 +02001003 assert(_PyUnicode_WSTR(unicode) != NULL);
1004
1005 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001006 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001007 PyErr_NoMemory();
1008 return -1;
1009 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001010 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001011 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001013 if (!wstr) {
1014 PyErr_NoMemory();
1015 return -1;
1016 }
1017 _PyUnicode_WSTR(unicode) = wstr;
1018 _PyUnicode_WSTR(unicode)[length] = 0;
1019 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 return 0;
1022}
1023
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024static PyObject*
1025resize_copy(PyObject *unicode, Py_ssize_t length)
1026{
1027 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030
Benjamin Petersonbac79492012-01-14 13:34:47 -05001031 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001050 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1170
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1174void *_PyUnicode_data(void *unicode){
1175 printf("obj %p\n", unicode);
1176 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1177 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1178 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1179 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1180 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1181 return PyUnicode_DATA(unicode);
1182}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001183
1184void
1185_PyUnicode_Dump(PyObject *op)
1186{
1187 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001188 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1189 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1190 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001191
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001193 {
1194 if (ascii->state.ascii)
1195 data = (ascii + 1);
1196 else
1197 data = (compact + 1);
1198 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001199 else
1200 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001201 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1202 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001203
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 if (ascii->wstr == data)
1205 printf("shared ");
1206 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera3b334d2011-10-03 13:53:37 +02001208 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001209 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1211 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001212 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1213 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001216}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: the
   characters in [begin, end) are written to the 4-byte data of unicode, and
   a high surrogate immediately followed by a low surrogate is joined into a
   single code point.  The other conversions are implemented as macros for
   efficiency.

   The number of code points produced must equal the length of unicode (this
   is asserted).  This function assumes that unicode can hold one more code
   point than wstr characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only examined when it lies inside the input */
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            /* ordinary character or lone surrogate: copied unchanged */
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001437 Py_MEMCPY((char*)to_data + to_kind * to_start,
1438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
1506 assert(0);
1507 return -1;
1508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Victor Stinnerd3f08822012-05-29 12:57:52 +02001551 if (from_start < 0) {
1552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
1555 if (to_start < 0) {
1556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1560 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1561 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001562 "Cannot write %zi characters at %zi "
1563 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 how_many, to_start, PyUnicode_GET_LENGTH(to));
1565 return -1;
1566 }
1567
1568 if (how_many == 0)
1569 return 0;
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572 return -1;
1573
1574 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1575 if (err) {
1576 PyErr_Format(PyExc_SystemError,
1577 "Cannot copy %s characters "
1578 "into a string of %s characters",
1579 unicode_kind_name(from),
1580 unicode_kind_name(to));
1581 return -1;
1582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584}
1585
Victor Stinner17222162011-09-28 22:15:37 +02001586/* Find the maximum code point and count the number of surrogate pairs so a
1587 correct string length can be computed before converting a string to UCS4.
1588 This function counts single surrogates as a character and not as a pair.
1589
1590 Return 0 on success, or -1 on error. */
1591static int
1592find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1593 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594{
1595 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001596 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerc53be962011-10-02 21:33:54 +02001598 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 *num_surrogates = 0;
1600 *maxchar = 0;
1601
1602 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001604 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1605 && (iter+1) < end
1606 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1607 {
1608 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1609 ++(*num_surrogates);
1610 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 }
1612 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001614 {
1615 ch = *iter;
1616 iter++;
1617 }
1618 if (ch > *maxchar) {
1619 *maxchar = ch;
1620 if (*maxchar > MAX_UNICODE) {
1621 PyErr_Format(PyExc_ValueError,
1622 "character U+%x is not in range [U+0000; U+10ffff]",
1623 ch);
1624 return -1;
1625 }
1626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 }
1628 return 0;
1629}
1630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001631int
1632_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633{
1634 wchar_t *end;
1635 Py_UCS4 maxchar = 0;
1636 Py_ssize_t num_surrogates;
1637#if SIZEOF_WCHAR_T == 2
1638 Py_ssize_t length_wo_surrogates;
1639#endif
1640
Georg Brandl7597add2011-10-05 16:36:47 +02001641 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001642 strings were created using _PyObject_New() and where no canonical
1643 representation (the str field) has been set yet aka strings
1644 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001645 assert(_PyUnicode_CHECK(unicode));
1646 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001648 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001649 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001650 /* Actually, it should neither be interned nor be anything else: */
1651 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001654 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
1658 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001659 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1660 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 PyErr_NoMemory();
1662 return -1;
1663 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001664 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 _PyUnicode_WSTR(unicode), end,
1666 PyUnicode_1BYTE_DATA(unicode));
1667 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1668 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1669 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1670 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001671 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001672 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001673 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 }
1675 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8(unicode) = NULL;
1678 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 PyObject_FREE(_PyUnicode_WSTR(unicode));
1681 _PyUnicode_WSTR(unicode) = NULL;
1682 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1683 }
1684 /* In this case we might have to convert down from 4-byte native
1685 wchar_t to 2-byte unicode. */
1686 else if (maxchar < 65536) {
1687 assert(num_surrogates == 0 &&
1688 "FindMaxCharAndNumSurrogatePairs() messed up");
1689
Victor Stinner506f5922011-09-28 22:34:18 +02001690#if SIZEOF_WCHAR_T == 2
1691 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001693 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1694 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1695 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001698#else
1699 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001700 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001701 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001702 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001703 PyErr_NoMemory();
1704 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 }
Victor Stinner506f5922011-09-28 22:34:18 +02001706 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1707 _PyUnicode_WSTR(unicode), end,
1708 PyUnicode_2BYTE_DATA(unicode));
1709 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1710 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1711 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001712 _PyUnicode_UTF8(unicode) = NULL;
1713 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyObject_FREE(_PyUnicode_WSTR(unicode));
1715 _PyUnicode_WSTR(unicode) = NULL;
1716 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1717#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1720 else {
1721#if SIZEOF_WCHAR_T == 2
1722 /* in case the native representation is 2-bytes, we need to allocate a
1723 new normalized 4-byte version. */
1724 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001725 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1726 PyErr_NoMemory();
1727 return -1;
1728 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001729 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1730 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 PyErr_NoMemory();
1732 return -1;
1733 }
1734 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1735 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001736 _PyUnicode_UTF8(unicode) = NULL;
1737 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001738 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1739 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001740 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject_FREE(_PyUnicode_WSTR(unicode));
1742 _PyUnicode_WSTR(unicode) = NULL;
1743 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1744#else
1745 assert(num_surrogates == 0);
1746
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1752#endif
1753 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1754 }
1755 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001756 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return 0;
1758}
1759
Alexander Belopolsky40018472011-02-26 01:02:56 +00001760static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001761unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762{
Walter Dörwald16807132007-05-25 13:52:07 +00001763 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 case SSTATE_NOT_INTERNED:
1765 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001766
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 case SSTATE_INTERNED_MORTAL:
1768 /* revive dead object temporarily for DelItem */
1769 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001770 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 Py_FatalError(
1772 "deletion of interned string failed");
1773 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001774
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 case SSTATE_INTERNED_IMMORTAL:
1776 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 default:
1779 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001780 }
1781
Victor Stinner03490912011-10-03 23:45:12 +02001782 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001784 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001785 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001786 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1787 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001789 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790}
1791
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or the cached one-character string for a code point below
   256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809static int
Victor Stinner488fa492011-12-12 00:01:39 +01001810unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001811{
Victor Stinner488fa492011-12-12 00:01:39 +01001812 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001813 if (Py_REFCNT(unicode) != 1)
1814 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001815 if (_PyUnicode_HASH(unicode) != -1)
1816 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (PyUnicode_CHECK_INTERNED(unicode))
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (!PyUnicode_CheckExact(unicode))
1820 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001821#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001822 /* singleton refcount is greater than 1 */
1823 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001824#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001825 return 1;
1826}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001827
Victor Stinnerfe226c02011-10-03 03:52:20 +02001828static int
1829unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1830{
1831 PyObject *unicode;
1832 Py_ssize_t old_length;
1833
1834 assert(p_unicode != NULL);
1835 unicode = *p_unicode;
1836
1837 assert(unicode != NULL);
1838 assert(PyUnicode_Check(unicode));
1839 assert(0 <= length);
1840
Victor Stinner910337b2011-10-03 03:20:16 +02001841 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001842 old_length = PyUnicode_WSTR_LENGTH(unicode);
1843 else
1844 old_length = PyUnicode_GET_LENGTH(unicode);
1845 if (old_length == length)
1846 return 0;
1847
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001848 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001849 _Py_INCREF_UNICODE_EMPTY();
1850 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001852 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 return 0;
1854 }
1855
Victor Stinner488fa492011-12-12 00:01:39 +01001856 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 PyObject *copy = resize_copy(unicode, length);
1858 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001860 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001862 }
1863
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001865 PyObject *new_unicode = resize_compact(unicode, length);
1866 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001867 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001868 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001870 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001871 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001876{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *unicode;
1878 if (p_unicode == NULL) {
1879 PyErr_BadInternalCall();
1880 return -1;
1881 }
1882 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001883 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 {
1885 PyErr_BadInternalCall();
1886 return -1;
1887 }
1888 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001889}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001890
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001891/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001892
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001893 WARNING: The function doesn't copy the terminating null character and
1894 doesn't check the maximum character (may write a latin1 character in an
1895 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001896static void
1897unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1898 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001899{
1900 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1901 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001902 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001903
1904 switch (kind) {
1905 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001906 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001907#ifdef Py_DEBUG
1908 if (PyUnicode_IS_ASCII(unicode)) {
1909 Py_UCS4 maxchar = ucs1lib_find_max_char(
1910 (const Py_UCS1*)str,
1911 (const Py_UCS1*)str + len);
1912 assert(maxchar < 128);
1913 }
1914#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001915 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001916 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 }
1918 case PyUnicode_2BYTE_KIND: {
1919 Py_UCS2 *start = (Py_UCS2 *)data + index;
1920 Py_UCS2 *ucs2 = start;
1921 assert(index <= PyUnicode_GET_LENGTH(unicode));
1922
Victor Stinner184252a2012-06-16 02:57:41 +02001923 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001924 *ucs2 = (Py_UCS2)*str;
1925
1926 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001927 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 }
1929 default: {
1930 Py_UCS4 *start = (Py_UCS4 *)data + index;
1931 Py_UCS4 *ucs4 = start;
1932 assert(kind == PyUnicode_4BYTE_KIND);
1933 assert(index <= PyUnicode_GET_LENGTH(unicode));
1934
Victor Stinner184252a2012-06-16 02:57:41 +02001935 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001936 *ucs4 = (Py_UCS4)*str;
1937
1938 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 }
1941}
1942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943static PyObject*
1944get_latin1_char(unsigned char ch)
1945{
Victor Stinnera464fc12011-10-02 20:39:30 +02001946 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001948 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 if (!unicode)
1950 return NULL;
1951 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001952 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 unicode_latin1[ch] = unicode;
1954 }
1955 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001956 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957}
1958
Victor Stinner985a82a2014-01-03 12:53:47 +01001959static PyObject*
1960unicode_char(Py_UCS4 ch)
1961{
1962 PyObject *unicode;
1963
1964 assert(ch <= MAX_UNICODE);
1965
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001966 if (ch < 256)
1967 return get_latin1_char(ch);
1968
Victor Stinner985a82a2014-01-03 12:53:47 +01001969 unicode = PyUnicode_New(1, ch);
1970 if (unicode == NULL)
1971 return NULL;
1972 switch (PyUnicode_KIND(unicode)) {
1973 case PyUnicode_1BYTE_KIND:
1974 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1975 break;
1976 case PyUnicode_2BYTE_KIND:
1977 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1978 break;
1979 default:
1980 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1981 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1982 }
1983 assert(_PyUnicode_CheckConsistency(unicode, 1));
1984 return unicode;
1985}
1986
Alexander Belopolsky40018472011-02-26 01:02:56 +00001987PyObject *
1988PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001990 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_UCS4 maxchar = 0;
1992 Py_ssize_t num_surrogates;
1993
1994 if (u == NULL)
1995 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001997 /* If the Unicode data is known at construction time, we can apply
1998 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002001 if (size == 0)
2002 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 /* Single character Unicode objects in the Latin-1 range are
2005 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002006 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return get_latin1_char((unsigned char)*u);
2008
2009 /* If not empty and not single character, copy the Unicode data
2010 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002011 if (find_maxchar_surrogates(u, u + size,
2012 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014
Victor Stinner8faf8212011-12-08 22:14:11 +01002015 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 if (!unicode)
2017 return NULL;
2018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 switch (PyUnicode_KIND(unicode)) {
2020 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2023 break;
2024 case PyUnicode_2BYTE_KIND:
2025#if Py_UNICODE_SIZE == 2
2026 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2027#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002028 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2030#endif
2031 break;
2032 case PyUnicode_4BYTE_KIND:
2033#if SIZEOF_WCHAR_T == 2
2034 /* This is the only case which has to process surrogates, thus
2035 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002036 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037#else
2038 assert(num_surrogates == 0);
2039 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2040#endif
2041 break;
2042 default:
2043 assert(0 && "Impossible state");
2044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002046 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047}
2048
Alexander Belopolsky40018472011-02-26 01:02:56 +00002049PyObject *
2050PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 if (size < 0) {
2053 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 return NULL;
2056 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002057 if (u != NULL)
2058 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2059 else
2060 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065{
2066 size_t size = strlen(u);
2067 if (size > PY_SSIZE_T_MAX) {
2068 PyErr_SetString(PyExc_OverflowError, "input too long");
2069 return NULL;
2070 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002071 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002072}
2073
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074PyObject *
2075_PyUnicode_FromId(_Py_Identifier *id)
2076{
2077 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002078 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2079 strlen(id->string),
2080 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 if (!id->object)
2082 return NULL;
2083 PyUnicode_InternInPlace(&id->object);
2084 assert(!id->next);
2085 id->next = static_strings;
2086 static_strings = id;
2087 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002088 return id->object;
2089}
2090
2091void
2092_PyUnicode_ClearStaticStrings()
2093{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 _Py_Identifier *tmp, *s = static_strings;
2095 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002096 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002097 tmp = s->next;
2098 s->next = NULL;
2099 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002100 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002101 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102}
2103
Benjamin Peterson0df54292012-03-26 14:50:32 -04002104/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002105
Victor Stinnerd3f08822012-05-29 12:57:52 +02002106PyObject*
2107_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002108{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002109 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002110 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002111 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002112#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002113 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002114#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002115 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 }
Victor Stinner785938e2011-12-11 20:09:03 +01002117 unicode = PyUnicode_New(size, 127);
2118 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002119 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002120 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2121 assert(_PyUnicode_CheckConsistency(unicode, 1));
2122 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002123}
2124
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002125static Py_UCS4
2126kind_maxchar_limit(unsigned int kind)
2127{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002128 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002129 case PyUnicode_1BYTE_KIND:
2130 return 0x80;
2131 case PyUnicode_2BYTE_KIND:
2132 return 0x100;
2133 case PyUnicode_4BYTE_KIND:
2134 return 0x10000;
2135 default:
2136 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002137 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002138 }
2139}
2140
Victor Stinnere6abb482012-05-02 01:15:40 +02002141Py_LOCAL_INLINE(Py_UCS4)
2142align_maxchar(Py_UCS4 maxchar)
2143{
2144 if (maxchar <= 127)
2145 return 127;
2146 else if (maxchar <= 255)
2147 return 255;
2148 else if (maxchar <= 65535)
2149 return 65535;
2150 else
2151 return MAX_UNICODE;
2152}
2153
Victor Stinner702c7342011-10-05 13:50:52 +02002154static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002155_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002159
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002162 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002163 if (size == 1)
2164 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002165
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002166 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002167 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 if (!res)
2169 return NULL;
2170 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002171 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002173}
2174
Victor Stinnere57b1c02011-09-28 22:20:48 +02002175static PyObject*
2176_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177{
2178 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002180
Serhiy Storchaka678db842013-01-26 12:16:36 +02002181 if (size == 0)
2182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002183 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 if (size == 1)
2185 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002187 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002188 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 if (!res)
2190 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002193 else {
2194 _PyUnicode_CONVERT_BYTES(
2195 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2196 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002197 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return res;
2199}
2200
Victor Stinnere57b1c02011-09-28 22:20:48 +02002201static PyObject*
2202_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203{
2204 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206
Serhiy Storchaka678db842013-01-26 12:16:36 +02002207 if (size == 0)
2208 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002209 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002210 if (size == 1)
2211 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002213 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002214 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 if (!res)
2216 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002217 if (max_char < 256)
2218 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2219 PyUnicode_1BYTE_DATA(res));
2220 else if (max_char < 0x10000)
2221 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2222 PyUnicode_2BYTE_DATA(res));
2223 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002225 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 return res;
2227}
2228
2229PyObject*
2230PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2231{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002232 if (size < 0) {
2233 PyErr_SetString(PyExc_ValueError, "size must be positive");
2234 return NULL;
2235 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002236 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002242 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 PyErr_SetString(PyExc_SystemError, "invalid kind");
2245 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247}
2248
Victor Stinnerece58de2012-04-23 23:36:38 +02002249Py_UCS4
2250_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2251{
2252 enum PyUnicode_Kind kind;
2253 void *startptr, *endptr;
2254
2255 assert(PyUnicode_IS_READY(unicode));
2256 assert(0 <= start);
2257 assert(end <= PyUnicode_GET_LENGTH(unicode));
2258 assert(start <= end);
2259
2260 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2261 return PyUnicode_MAX_CHAR_VALUE(unicode);
2262
2263 if (start == end)
2264 return 127;
2265
Victor Stinner94d558b2012-04-27 22:26:58 +02002266 if (PyUnicode_IS_ASCII(unicode))
2267 return 127;
2268
Victor Stinnerece58de2012-04-23 23:36:38 +02002269 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002270 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002271 endptr = (char *)startptr + end * kind;
2272 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002273 switch(kind) {
2274 case PyUnicode_1BYTE_KIND:
2275 return ucs1lib_find_max_char(startptr, endptr);
2276 case PyUnicode_2BYTE_KIND:
2277 return ucs2lib_find_max_char(startptr, endptr);
2278 case PyUnicode_4BYTE_KIND:
2279 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002280 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002281 assert(0);
2282 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002283 }
2284}
2285
Victor Stinner25a4b292011-10-06 12:31:55 +02002286/* Ensure that a string uses the most efficient storage, if it is not the
2287 case: create a new string with of the right kind. Write NULL into *p_unicode
2288 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002289static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002290unicode_adjust_maxchar(PyObject **p_unicode)
2291{
2292 PyObject *unicode, *copy;
2293 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002294 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002295 unsigned int kind;
2296
2297 assert(p_unicode != NULL);
2298 unicode = *p_unicode;
2299 assert(PyUnicode_IS_READY(unicode));
2300 if (PyUnicode_IS_ASCII(unicode))
2301 return;
2302
2303 len = PyUnicode_GET_LENGTH(unicode);
2304 kind = PyUnicode_KIND(unicode);
2305 if (kind == PyUnicode_1BYTE_KIND) {
2306 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 max_char = ucs1lib_find_max_char(u, u + len);
2308 if (max_char >= 128)
2309 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002310 }
2311 else if (kind == PyUnicode_2BYTE_KIND) {
2312 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs2lib_find_max_char(u, u + len);
2314 if (max_char >= 256)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002319 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs4lib_find_max_char(u, u + len);
2321 if (max_char >= 0x10000)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002325 if (copy != NULL)
2326 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002327 Py_DECREF(unicode);
2328 *p_unicode = copy;
2329}
2330
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002332_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002333{
Victor Stinner87af4f22011-11-21 23:03:47 +01002334 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002335 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadInternalCall();
2339 return NULL;
2340 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002341 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002343
Victor Stinner87af4f22011-11-21 23:03:47 +01002344 length = PyUnicode_GET_LENGTH(unicode);
2345 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 if (!copy)
2347 return NULL;
2348 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2351 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002352 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002353 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002354}
2355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356
Victor Stinnerbc603d12011-10-02 01:00:40 +02002357/* Widen Unicode objects to larger buffers. Don't write terminating null
2358 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359
2360void*
2361_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2362{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363 Py_ssize_t len;
2364 void *result;
2365 unsigned int skind;
2366
Benjamin Petersonbac79492012-01-14 13:34:47 -05002367 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 return NULL;
2369
2370 len = PyUnicode_GET_LENGTH(s);
2371 skind = PyUnicode_KIND(s);
2372 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002373 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return NULL;
2375 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002376 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002377 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002378 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 if (!result)
2380 return PyErr_NoMemory();
2381 assert(skind == PyUnicode_1BYTE_KIND);
2382 _PyUnicode_CONVERT_BYTES(
2383 Py_UCS1, Py_UCS2,
2384 PyUnicode_1BYTE_DATA(s),
2385 PyUnicode_1BYTE_DATA(s) + len,
2386 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002388 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002389 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 if (!result)
2391 return PyErr_NoMemory();
2392 if (skind == PyUnicode_2BYTE_KIND) {
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS2, Py_UCS4,
2395 PyUnicode_2BYTE_DATA(s),
2396 PyUnicode_2BYTE_DATA(s) + len,
2397 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002399 else {
2400 assert(skind == PyUnicode_1BYTE_KIND);
2401 _PyUnicode_CONVERT_BYTES(
2402 Py_UCS1, Py_UCS4,
2403 PyUnicode_1BYTE_DATA(s),
2404 PyUnicode_1BYTE_DATA(s) + len,
2405 result);
2406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002408 default:
2409 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 }
Victor Stinner01698042011-10-04 00:04:26 +02002411 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return NULL;
2413}
2414
2415static Py_UCS4*
2416as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2417 int copy_null)
2418{
2419 int kind;
2420 void *data;
2421 Py_ssize_t len, targetlen;
2422 if (PyUnicode_READY(string) == -1)
2423 return NULL;
2424 kind = PyUnicode_KIND(string);
2425 data = PyUnicode_DATA(string);
2426 len = PyUnicode_GET_LENGTH(string);
2427 targetlen = len;
2428 if (copy_null)
2429 targetlen++;
2430 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002431 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!target) {
2433 PyErr_NoMemory();
2434 return NULL;
2435 }
2436 }
2437 else {
2438 if (targetsize < targetlen) {
2439 PyErr_Format(PyExc_SystemError,
2440 "string is longer than the buffer");
2441 if (copy_null && 0 < targetsize)
2442 target[0] = 0;
2443 return NULL;
2444 }
2445 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002446 if (kind == PyUnicode_1BYTE_KIND) {
2447 Py_UCS1 *start = (Py_UCS1 *) data;
2448 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 else if (kind == PyUnicode_2BYTE_KIND) {
2451 Py_UCS2 *start = (Py_UCS2 *) data;
2452 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2453 }
2454 else {
2455 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (copy_null)
2459 target[len] = 0;
2460 return target;
2461}
2462
2463Py_UCS4*
2464PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002467 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 PyErr_BadInternalCall();
2469 return NULL;
2470 }
2471 return as_ucs4(string, target, targetsize, copy_null);
2472}
2473
2474Py_UCS4*
2475PyUnicode_AsUCS4Copy(PyObject *string)
2476{
2477 return as_ucs4(string, NULL, 0, 1);
2478}
2479
2480#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002481
Alexander Belopolsky40018472011-02-26 01:02:56 +00002482PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002483PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002487 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 PyErr_BadInternalCall();
2489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
2491
Martin v. Löwis790465f2008-04-05 20:41:37 +00002492 if (size == -1) {
2493 size = wcslen(w);
2494 }
2495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497}
2498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002500
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002505
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002506static int
2507unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2508 Py_ssize_t width, Py_ssize_t precision)
2509{
2510 Py_ssize_t length, fill, arglen;
2511 Py_UCS4 maxchar;
2512
2513 if (PyUnicode_READY(str) == -1)
2514 return -1;
2515
2516 length = PyUnicode_GET_LENGTH(str);
2517 if ((precision == -1 || precision >= length)
2518 && width <= length)
2519 return _PyUnicodeWriter_WriteStr(writer, str);
2520
2521 if (precision != -1)
2522 length = Py_MIN(precision, length);
2523
2524 arglen = Py_MAX(length, width);
2525 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527 else
2528 maxchar = writer->maxchar;
2529
2530 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531 return -1;
2532
2533 if (width > length) {
2534 fill = width - length;
2535 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536 return -1;
2537 writer->pos += fill;
2538 }
2539
2540 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541 str, 0, length);
2542 writer->pos += length;
2543 return 0;
2544}
2545
2546static int
2547unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2548 Py_ssize_t width, Py_ssize_t precision)
2549{
2550 /* UTF-8 */
2551 Py_ssize_t length;
2552 PyObject *unicode;
2553 int res;
2554
2555 length = strlen(str);
2556 if (precision != -1)
2557 length = Py_MIN(length, precision);
2558 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2559 if (unicode == NULL)
2560 return -1;
2561
2562 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2563 Py_DECREF(unicode);
2564 return res;
2565}
2566
Victor Stinner96865452011-03-01 23:44:09 +00002567static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002568unicode_fromformat_arg(_PyUnicodeWriter *writer,
2569 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002570{
Victor Stinnere215d962012-10-06 23:03:36 +02002571 const char *p;
2572 Py_ssize_t len;
2573 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width;
2575 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 int longflag;
2577 int longlongflag;
2578 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580
2581 p = f;
2582 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002583 zeropad = 0;
2584 if (*f == '0') {
2585 zeropad = 1;
2586 f++;
2587 }
Victor Stinner96865452011-03-01 23:44:09 +00002588
2589 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 width = -1;
2591 if (Py_ISDIGIT((unsigned)*f)) {
2592 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002593 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002596 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002598 return NULL;
2599 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002601 f++;
2602 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002603 }
2604 precision = -1;
2605 if (*f == '.') {
2606 f++;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 precision = (*f - '0');
2609 f++;
2610 while (Py_ISDIGIT((unsigned)*f)) {
2611 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612 PyErr_SetString(PyExc_ValueError,
2613 "precision too big");
2614 return NULL;
2615 }
2616 precision = (precision * 10) + (*f - '0');
2617 f++;
2618 }
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620 if (*f == '%') {
2621 /* "%.3%s" => f points to "3" */
2622 f--;
2623 }
2624 }
2625 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002627 f--;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629
2630 /* Handle %ld, %lu, %lld and %llu. */
2631 longflag = 0;
2632 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002633 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002635 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002636 longflag = 1;
2637 ++f;
2638 }
Victor Stinner96865452011-03-01 23:44:09 +00002639 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longlongflag = 1;
2642 f += 2;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 }
2645 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002646 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002647 size_tflag = 1;
2648 ++f;
2649 }
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 if (f[1] == '\0')
2652 writer->overallocate = 0;
2653
2654 switch (*f) {
2655 case 'c':
2656 {
2657 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002658 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002659 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002660 "character argument not in range(0x110000)");
2661 return NULL;
2662 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002663 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002664 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002665 break;
2666 }
2667
2668 case 'i':
2669 case 'd':
2670 case 'u':
2671 case 'x':
2672 {
2673 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002674 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002676
2677 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002678 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002679 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002680 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002681 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002683 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002684 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, size_t));
2687 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, unsigned int));
2690 }
2691 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002692 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002693 }
2694 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002695 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002697 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002699 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002700 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002701 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, Py_ssize_t));
2704 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002705 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002706 va_arg(*vargs, int));
2707 }
2708 assert(len >= 0);
2709
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (precision < len)
2711 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002712
2713 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2715 return NULL;
2716
Victor Stinnere215d962012-10-06 23:03:36 +02002717 if (width > precision) {
2718 Py_UCS4 fillchar;
2719 fill = width - precision;
2720 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002721 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2722 return NULL;
2723 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002724 }
Victor Stinner15a11362012-10-06 23:48:20 +02002725 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002726 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002727 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2728 return NULL;
2729 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002730 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002731
Victor Stinner4a587072013-11-19 12:54:53 +01002732 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2733 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002734 break;
2735 }
2736
2737 case 'p':
2738 {
2739 char number[MAX_LONG_LONG_CHARS];
2740
2741 len = sprintf(number, "%p", va_arg(*vargs, void*));
2742 assert(len >= 0);
2743
2744 /* %p is ill-defined: ensure leading 0x. */
2745 if (number[1] == 'X')
2746 number[1] = 'x';
2747 else if (number[1] != 'x') {
2748 memmove(number + 2, number,
2749 strlen(number) + 1);
2750 number[0] = '0';
2751 number[1] = 'x';
2752 len += 2;
2753 }
2754
Victor Stinner4a587072013-11-19 12:54:53 +01002755 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002756 return NULL;
2757 break;
2758 }
2759
2760 case 's':
2761 {
2762 /* UTF-8 */
2763 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'U':
2770 {
2771 PyObject *obj = va_arg(*vargs, PyObject *);
2772 assert(obj && _PyUnicode_CHECK(obj));
2773
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002774 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002775 return NULL;
2776 break;
2777 }
2778
2779 case 'V':
2780 {
2781 PyObject *obj = va_arg(*vargs, PyObject *);
2782 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002783 if (obj) {
2784 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002785 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002786 return NULL;
2787 }
2788 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 assert(str != NULL);
2790 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002792 }
2793 break;
2794 }
2795
2796 case 'S':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 PyObject *str;
2800 assert(obj);
2801 str = PyObject_Str(obj);
2802 if (!str)
2803 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002805 Py_DECREF(str);
2806 return NULL;
2807 }
2808 Py_DECREF(str);
2809 break;
2810 }
2811
2812 case 'R':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *repr;
2816 assert(obj);
2817 repr = PyObject_Repr(obj);
2818 if (!repr)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(repr);
2822 return NULL;
2823 }
2824 Py_DECREF(repr);
2825 break;
2826 }
2827
2828 case 'A':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *ascii;
2832 assert(obj);
2833 ascii = PyObject_ASCII(obj);
2834 if (!ascii)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(ascii);
2838 return NULL;
2839 }
2840 Py_DECREF(ascii);
2841 break;
2842 }
2843
2844 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002845 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002846 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002847 break;
2848
2849 default:
2850 /* if we stumble upon an unknown formatting code, copy the rest
2851 of the format string to the output string. (we cannot just
2852 skip the code, since there's no way to know what's in the
2853 argument list) */
2854 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002855 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 f = p+len;
2858 return f;
2859 }
2860
2861 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002862 return f;
2863}
2864
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865PyObject *
2866PyUnicode_FromFormatV(const char *format, va_list vargs)
2867{
Victor Stinnere215d962012-10-06 23:03:36 +02002868 va_list vargs2;
2869 const char *f;
2870 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871
Victor Stinner8f674cc2013-04-17 23:02:17 +02002872 _PyUnicodeWriter_Init(&writer);
2873 writer.min_length = strlen(format) + 100;
2874 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002875
2876 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2877 Copy it to be able to pass a reference to a subfunction. */
2878 Py_VA_COPY(vargs2, vargs);
2879
2880 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002881 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 f = unicode_fromformat_arg(&writer, f, &vargs2);
2883 if (f == NULL)
2884 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002887 const char *p;
2888 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinnere215d962012-10-06 23:03:36 +02002890 p = f;
2891 do
2892 {
2893 if ((unsigned char)*p > 127) {
2894 PyErr_Format(PyExc_ValueError,
2895 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2896 "string, got a non-ASCII byte: 0x%02x",
2897 (unsigned char)*p);
2898 return NULL;
2899 }
2900 p++;
2901 }
2902 while (*p != '\0' && *p != '%');
2903 len = p - f;
2904
2905 if (*p == '\0')
2906 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002907
2908 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002909 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 return _PyUnicodeWriter_Finish(&writer);
2915
2916 fail:
2917 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002919}
2920
Walter Dörwaldd2034312007-05-18 16:29:38 +00002921PyObject *
2922PyUnicode_FromFormat(const char *format, ...)
2923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 PyObject* ret;
2925 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926
2927#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002929#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002931#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002932 ret = PyUnicode_FromFormatV(format, vargs);
2933 va_end(vargs);
2934 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935}
2936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937#ifdef HAVE_WCHAR_H
2938
Victor Stinner5593d8a2010-10-02 11:11:27 +00002939/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2940 convert a Unicode object to a wide character string.
2941
Victor Stinnerd88d9832011-09-06 02:00:05 +02002942 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002943 character) required to convert the unicode object. Ignore size argument.
2944
Victor Stinnerd88d9832011-09-06 02:00:05 +02002945 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002947 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002948static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002949unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002950 wchar_t *w,
2951 Py_ssize_t size)
2952{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002953 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 const wchar_t *wstr;
2955
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 if (wstr == NULL)
2958 return -1;
2959
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (size > res)
2962 size = res + 1;
2963 else
2964 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 return res;
2967 }
2968 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002970}
2971
2972Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002973PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002974 wchar_t *w,
2975 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976{
2977 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 PyErr_BadInternalCall();
2979 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002981 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982}
2983
Victor Stinner137c34c2010-09-29 10:25:54 +00002984wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002985PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002986 Py_ssize_t *size)
2987{
2988 wchar_t* buffer;
2989 Py_ssize_t buflen;
2990
2991 if (unicode == NULL) {
2992 PyErr_BadInternalCall();
2993 return NULL;
2994 }
2995
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002996 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997 if (buflen == -1)
2998 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002999 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003000 if (buffer == NULL) {
3001 PyErr_NoMemory();
3002 return NULL;
3003 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003004 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003005 if (buflen == -1) {
3006 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003008 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003009 if (size != NULL)
3010 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003011 return buffer;
3012}
3013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003018{
Victor Stinner8faf8212011-12-08 22:14:11 +01003019 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyErr_SetString(PyExc_ValueError,
3021 "chr() arg not in range(0x110000)");
3022 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003023 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003024
Victor Stinner985a82a2014-01-03 12:53:47 +01003025 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003026}
3027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003029PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003031 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003033 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003034 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003035 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 Py_INCREF(obj);
3037 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003038 }
3039 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 /* For a Unicode subtype that's not a Unicode object,
3041 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003042 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003044 PyErr_Format(PyExc_TypeError,
3045 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003046 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003047 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003048}
3049
Alexander Belopolsky40018472011-02-26 01:02:56 +00003050PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003051PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003052 const char *encoding,
3053 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003055 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003056 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003057
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 PyErr_BadInternalCall();
3060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003063 /* Decoding bytes objects is the most common case and should be fast */
3064 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003065 if (PyBytes_GET_SIZE(obj) == 0)
3066 _Py_RETURN_UNICODE_EMPTY();
3067 v = PyUnicode_Decode(
3068 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3069 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 return v;
3071 }
3072
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003073 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 PyErr_SetString(PyExc_TypeError,
3075 "decoding str is not supported");
3076 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003077 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003078
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003079 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3080 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3081 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003082 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003083 Py_TYPE(obj)->tp_name);
3084 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003085 }
Tim Petersced69f82003-09-16 20:30:58 +00003086
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003088 PyBuffer_Release(&buffer);
3089 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003091
Serhiy Storchaka05997252013-01-26 12:14:02 +02003092 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003094 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
/* Normalize an encoding name: C implementation of
   encodings.normalize_encoding().

   Alphanumeric characters and '.' are copied to lower in lowercase. Any run
   of other characters found between two copied characters is replaced by a
   single '_'; such runs at the start or the end of the name are dropped.
   The output is null-terminated.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last writable slot: reserved for the trailing null character. */
    char *const dst_end = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3145
Alexander Belopolsky40018472011-02-26 01:02:56 +00003146PyObject *
3147PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003148 Py_ssize_t size,
3149 const char *encoding,
3150 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003151{
3152 PyObject *buffer = NULL, *unicode;
3153 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003154 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3155
3156 if (encoding == NULL) {
3157 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3158 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003159
Fred Drakee4315f52000-05-09 19:53:39 +00003160 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003161 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3162 char *lower = buflower;
3163
3164 /* Fast paths */
3165 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3166 lower += 3;
3167 if (*lower == '_') {
3168 /* Match "utf8" and "utf_8" */
3169 lower++;
3170 }
3171
3172 if (lower[0] == '8' && lower[1] == 0) {
3173 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3174 }
3175 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3176 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3177 }
3178 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3179 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3180 }
3181 }
3182 else {
3183 if (strcmp(lower, "ascii") == 0
3184 || strcmp(lower, "us_ascii") == 0) {
3185 return PyUnicode_DecodeASCII(s, size, errors);
3186 }
3187 #ifdef HAVE_MBCS
3188 else if (strcmp(lower, "mbcs") == 0) {
3189 return PyUnicode_DecodeMBCS(s, size, errors);
3190 }
3191 #endif
3192 else if (strcmp(lower, "latin1") == 0
3193 || strcmp(lower, "latin_1") == 0
3194 || strcmp(lower, "iso_8859_1") == 0
3195 || strcmp(lower, "iso8859_1") == 0) {
3196 return PyUnicode_DecodeLatin1(s, size, errors);
3197 }
3198 }
Victor Stinner37296e82010-06-10 13:36:23 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200
3201 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003202 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003203 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003204 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003205 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 if (buffer == NULL)
3207 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003208 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 if (unicode == NULL)
3210 goto onError;
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003213 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3214 "use codecs.decode() to decode to arbitrary types",
3215 encoding,
3216 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 Py_DECREF(unicode);
3218 goto onError;
3219 }
3220 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003221 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003222
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 Py_XDECREF(buffer);
3225 return NULL;
3226}
3227
Alexander Belopolsky40018472011-02-26 01:02:56 +00003228PyObject *
3229PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003230 const char *encoding,
3231 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003232{
3233 PyObject *v;
3234
3235 if (!PyUnicode_Check(unicode)) {
3236 PyErr_BadArgument();
3237 goto onError;
3238 }
3239
3240 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242
3243 /* Decode via the codec registry */
3244 v = PyCodec_Decode(unicode, encoding, errors);
3245 if (v == NULL)
3246 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003247 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 return NULL;
3251}
3252
Alexander Belopolsky40018472011-02-26 01:02:56 +00003253PyObject *
3254PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003255 const char *encoding,
3256 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003257{
3258 PyObject *v;
3259
3260 if (!PyUnicode_Check(unicode)) {
3261 PyErr_BadArgument();
3262 goto onError;
3263 }
3264
3265 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003267
3268 /* Decode via the codec registry */
3269 v = PyCodec_Decode(unicode, encoding, errors);
3270 if (v == NULL)
3271 goto onError;
3272 if (!PyUnicode_Check(v)) {
3273 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003274 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3275 "use codecs.decode() to decode to arbitrary types",
3276 encoding,
3277 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003278 Py_DECREF(v);
3279 goto onError;
3280 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003281 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003282
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 return NULL;
3285}
3286
Alexander Belopolsky40018472011-02-26 01:02:56 +00003287PyObject *
3288PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003289 Py_ssize_t size,
3290 const char *encoding,
3291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292{
3293 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003294
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 unicode = PyUnicode_FromUnicode(s, size);
3296 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3299 Py_DECREF(unicode);
3300 return v;
3301}
3302
Alexander Belopolsky40018472011-02-26 01:02:56 +00003303PyObject *
3304PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003305 const char *encoding,
3306 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003307{
3308 PyObject *v;
3309
3310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
3312 goto onError;
3313 }
3314
3315 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003317
3318 /* Encode via the codec registry */
3319 v = PyCodec_Encode(unicode, encoding, errors);
3320 if (v == NULL)
3321 goto onError;
3322 return v;
3323
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003325 return NULL;
3326}
3327
/* Find the first character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time; when wchar_t is 2 bytes
   wide, a high surrogate followed by a low surrogate is converted as one
   unit. Return the index (in wchar_t units) of the first unit for which
   wcstombs() reports an error, or 0 if every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cur = wstr;

    while (*cur != L'\0') {
        const wchar_t *unit_start = cur;

        unit[0] = *cur++;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cur))
        {
            unit[1] = *cur++;
        }
        else {
            unit[1] = 0;
        }
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3374
Victor Stinner1b579672011-12-17 05:47:23 +01003375static int
3376locale_error_handler(const char *errors, int *surrogateescape)
3377{
Victor Stinner50149202015-09-22 00:26:54 +02003378 _Py_error_handler error_handler = get_error_handler(errors);
3379 switch (error_handler)
3380 {
3381 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003382 *surrogateescape = 0;
3383 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003384 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003385 *surrogateescape = 1;
3386 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003387 default:
3388 PyErr_Format(PyExc_ValueError,
3389 "only 'strict' and 'surrogateescape' error handlers "
3390 "are supported, not '%s'",
3391 errors);
3392 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003393 }
Victor Stinner1b579672011-12-17 05:47:23 +01003394}
3395
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003396PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003397PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003398{
3399 Py_ssize_t wlen, wlen2;
3400 wchar_t *wstr;
3401 PyObject *bytes = NULL;
3402 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003403 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003404 PyObject *exc;
3405 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003406 int surrogateescape;
3407
3408 if (locale_error_handler(errors, &surrogateescape) < 0)
3409 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410
3411 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3412 if (wstr == NULL)
3413 return NULL;
3414
3415 wlen2 = wcslen(wstr);
3416 if (wlen2 != wlen) {
3417 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003418 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 return NULL;
3420 }
3421
3422 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003423 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 char *str;
3425
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003426 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003427 if (str == NULL) {
3428 if (error_pos == (size_t)-1) {
3429 PyErr_NoMemory();
3430 PyMem_Free(wstr);
3431 return NULL;
3432 }
3433 else {
3434 goto encode_error;
3435 }
3436 }
3437 PyMem_Free(wstr);
3438
3439 bytes = PyBytes_FromString(str);
3440 PyMem_Free(str);
3441 }
3442 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003443 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444 size_t len, len2;
3445
3446 len = wcstombs(NULL, wstr, 0);
3447 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003448 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 goto encode_error;
3450 }
3451
3452 bytes = PyBytes_FromStringAndSize(NULL, len);
3453 if (bytes == NULL) {
3454 PyMem_Free(wstr);
3455 return NULL;
3456 }
3457
3458 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3459 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003460 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 goto encode_error;
3462 }
3463 PyMem_Free(wstr);
3464 }
3465 return bytes;
3466
3467encode_error:
3468 errmsg = strerror(errno);
3469 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003470
3471 if (error_pos == (size_t)-1)
3472 error_pos = wcstombs_errorpos(wstr);
3473
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003474 PyMem_Free(wstr);
3475 Py_XDECREF(bytes);
3476
Victor Stinner2f197072011-12-17 07:08:30 +01003477 if (errmsg != NULL) {
3478 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003479 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003480 if (wstr != NULL) {
3481 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003482 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003483 } else
3484 errmsg = NULL;
3485 }
3486 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003487 reason = PyUnicode_FromString(
3488 "wcstombs() encountered an unencodable "
3489 "wide character");
3490 if (reason == NULL)
3491 return NULL;
3492
3493 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3494 "locale", unicode,
3495 (Py_ssize_t)error_pos,
3496 (Py_ssize_t)(error_pos+1),
3497 reason);
3498 Py_DECREF(reason);
3499 if (exc != NULL) {
3500 PyCodec_StrictErrors(exc);
3501 Py_XDECREF(exc);
3502 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003503 return NULL;
3504}
3505
Victor Stinnerad158722010-10-27 00:25:46 +00003506PyObject *
3507PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003508{
Victor Stinner99b95382011-07-04 14:23:54 +02003509#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003511#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003513#else
Victor Stinner793b5312011-04-27 00:24:21 +02003514 PyInterpreterState *interp = PyThreadState_GET()->interp;
3515 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3516 cannot use it to encode and decode filenames before it is loaded. Load
3517 the Python codec requires to encode at least its own filename. Use the C
3518 version of the locale codec until the codec registry is initialized and
3519 the Python codec is loaded.
3520
3521 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3522 cannot only rely on it: check also interp->fscodec_initialized for
3523 subinterpreters. */
3524 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003525 return PyUnicode_AsEncodedString(unicode,
3526 Py_FileSystemDefaultEncoding,
3527 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003528 }
3529 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003530 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003531 }
Victor Stinnerad158722010-10-27 00:25:46 +00003532#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003533}
3534
Alexander Belopolsky40018472011-02-26 01:02:56 +00003535PyObject *
3536PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003537 const char *encoding,
3538 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539{
3540 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003541 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003542
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 if (!PyUnicode_Check(unicode)) {
3544 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 }
Fred Drakee4315f52000-05-09 19:53:39 +00003547
Victor Stinner942889a2016-09-05 15:40:10 -07003548 if (encoding == NULL) {
3549 return _PyUnicode_AsUTF8String(unicode, errors);
3550 }
3551
Fred Drakee4315f52000-05-09 19:53:39 +00003552 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003553 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3554 char *lower = buflower;
3555
3556 /* Fast paths */
3557 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3558 lower += 3;
3559 if (*lower == '_') {
3560 /* Match "utf8" and "utf_8" */
3561 lower++;
3562 }
3563
3564 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003566 }
3567 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3568 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3569 }
3570 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3571 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3572 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003573 }
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else {
3575 if (strcmp(lower, "ascii") == 0
3576 || strcmp(lower, "us_ascii") == 0) {
3577 return _PyUnicode_AsASCIIString(unicode, errors);
3578 }
Victor Stinner99b95382011-07-04 14:23:54 +02003579#ifdef HAVE_MBCS
Victor Stinner942889a2016-09-05 15:40:10 -07003580 else if (strcmp(lower, "mbcs") == 0) {
3581 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3582 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003583#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003584 else if (strcmp(lower, "latin1") == 0 ||
3585 strcmp(lower, "latin_1") == 0 ||
3586 strcmp(lower, "iso_8859_1") == 0 ||
3587 strcmp(lower, "iso8859_1") == 0) {
3588 return _PyUnicode_AsLatin1String(unicode, errors);
3589 }
3590 }
Victor Stinner37296e82010-06-10 13:36:23 +00003591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592
3593 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003594 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003596 return NULL;
3597
3598 /* The normal path */
3599 if (PyBytes_Check(v))
3600 return v;
3601
3602 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003603 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003605 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003606
3607 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003608 "encoder %s returned bytearray instead of bytes; "
3609 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003610 encoding);
3611 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003614 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3617 Py_DECREF(v);
3618 return b;
3619 }
3620
3621 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003622 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3623 "use codecs.encode() to encode to arbitrary types",
3624 encoding,
3625 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003626 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627 return NULL;
3628}
3629
Alexander Belopolsky40018472011-02-26 01:02:56 +00003630PyObject *
3631PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003632 const char *encoding,
3633 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634{
3635 PyObject *v;
3636
3637 if (!PyUnicode_Check(unicode)) {
3638 PyErr_BadArgument();
3639 goto onError;
3640 }
3641
3642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3652 "use codecs.encode() to encode to arbitrary types",
3653 encoding,
3654 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
/* Return the byte offset in str (of length len) at which mbrtowc() first
   reports an invalid or incomplete multibyte sequence.  Returns 0 when no
   such sequence is found, when a null wide character is decoded first, or
   when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t nbytes = mbrtowc(&wc, cursor, remaining, &state);

        if (nbytes == 0) {
            /* Reached end of string */
            break;
        }
        if (nbytes == (size_t)-1 || nbytes == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += nbytes;
        remaining -= nbytes;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3694
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003695PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003696PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003697 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698{
3699 wchar_t smallbuf[256];
3700 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3701 wchar_t *wstr;
3702 size_t wlen, wlen2;
3703 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003704 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003705 size_t error_pos;
3706 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003707 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3708 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003709
3710 if (locale_error_handler(errors, &surrogateescape) < 0)
3711 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003712
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003713 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3714 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715 return NULL;
3716 }
3717
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003718 if (surrogateescape) {
3719 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003720 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721 if (wstr == NULL) {
3722 if (wlen == (size_t)-1)
3723 PyErr_NoMemory();
3724 else
3725 PyErr_SetFromErrno(PyExc_OSError);
3726 return NULL;
3727 }
3728
3729 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003730 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003731 }
3732 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003733 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003734#ifndef HAVE_BROKEN_MBSTOWCS
3735 wlen = mbstowcs(NULL, str, 0);
3736#else
3737 wlen = len;
3738#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003739 if (wlen == (size_t)-1)
3740 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741 if (wlen+1 <= smallbuf_len) {
3742 wstr = smallbuf;
3743 }
3744 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003745 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 if (!wstr)
3747 return PyErr_NoMemory();
3748 }
3749
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003750 wlen2 = mbstowcs(wstr, str, wlen+1);
3751 if (wlen2 == (size_t)-1) {
3752 if (wstr != smallbuf)
3753 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003754 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 }
3756#ifdef HAVE_BROKEN_MBSTOWCS
3757 assert(wlen2 == wlen);
3758#endif
3759 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3760 if (wstr != smallbuf)
3761 PyMem_Free(wstr);
3762 }
3763 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003764
3765decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003766 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003767 errmsg = strerror(errno);
3768 assert(errmsg != NULL);
3769
3770 error_pos = mbstowcs_errorpos(str, len);
3771 if (errmsg != NULL) {
3772 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003773 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003774 if (wstr != NULL) {
3775 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003776 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003777 }
Victor Stinner2f197072011-12-17 07:08:30 +01003778 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003779 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003780 reason = PyUnicode_FromString(
3781 "mbstowcs() encountered an invalid multibyte sequence");
3782 if (reason == NULL)
3783 return NULL;
3784
3785 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3786 "locale", str, len,
3787 (Py_ssize_t)error_pos,
3788 (Py_ssize_t)(error_pos+1),
3789 reason);
3790 Py_DECREF(reason);
3791 if (exc != NULL) {
3792 PyCodec_StrictErrors(exc);
3793 Py_XDECREF(exc);
3794 }
3795 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003796}
3797
3798PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003799PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003800{
3801 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003802 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003803}
3804
3805
3806PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003807PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003808 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003809 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3810}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003811
Christian Heimes5894ba72007-11-04 11:43:14 +00003812PyObject*
3813PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3814{
Victor Stinner99b95382011-07-04 14:23:54 +02003815#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003816 return PyUnicode_DecodeMBCS(s, size, NULL);
3817#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003818 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003819#else
Victor Stinner793b5312011-04-27 00:24:21 +02003820 PyInterpreterState *interp = PyThreadState_GET()->interp;
3821 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3822 cannot use it to encode and decode filenames before it is loaded. Load
3823 the Python codec requires to encode at least its own filename. Use the C
3824 version of the locale codec until the codec registry is initialized and
3825 the Python codec is loaded.
3826
3827 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3828 cannot only rely on it: check also interp->fscodec_initialized for
3829 subinterpreters. */
3830 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003831 return PyUnicode_Decode(s, size,
3832 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003833 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003834 }
3835 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003836 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 }
Victor Stinnerad158722010-10-27 00:25:46 +00003838#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003839}
3840
Martin v. Löwis011e8422009-05-05 04:43:17 +00003841
3842int
3843PyUnicode_FSConverter(PyObject* arg, void* addr)
3844{
Brett Cannonec6ce872016-09-06 15:50:29 -07003845 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003846 PyObject *output = NULL;
3847 Py_ssize_t size;
3848 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003849 if (arg == NULL) {
3850 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003851 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003852 return 1;
3853 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003854 path = PyOS_FSPath(arg);
3855 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003856 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003857 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003858 if (PyBytes_Check(path)) {
3859 output = path;
3860 }
3861 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3862 output = PyUnicode_EncodeFSDefault(path);
3863 Py_DECREF(path);
3864 if (!output) {
3865 return 0;
3866 }
3867 assert(PyBytes_Check(output));
3868 }
3869
Victor Stinner0ea2a462010-04-30 00:22:08 +00003870 size = PyBytes_GET_SIZE(output);
3871 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003872 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003873 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003874 Py_DECREF(output);
3875 return 0;
3876 }
3877 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003878 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003879}
3880
3881
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003882int
3883PyUnicode_FSDecoder(PyObject* arg, void* addr)
3884{
3885 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003886 if (arg == NULL) {
3887 Py_DECREF(*(PyObject**)addr);
3888 return 1;
3889 }
3890 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003891 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003893 output = arg;
3894 Py_INCREF(output);
3895 }
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003896 else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
3897 if (!PyBytes_Check(arg) &&
3898 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3899 "path should be string or bytes, not %.200s",
3900 Py_TYPE(arg)->tp_name)) {
3901 return 0;
3902 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 arg = PyBytes_FromObject(arg);
3904 if (!arg)
3905 return 0;
3906 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3907 PyBytes_GET_SIZE(arg));
3908 Py_DECREF(arg);
3909 if (!output)
3910 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003911 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003912 else {
3913 PyErr_Format(PyExc_TypeError,
3914 "path should be string or bytes, not %.200s",
3915 Py_TYPE(arg)->tp_name);
3916 return 0;
3917 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003918 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003919 Py_DECREF(output);
3920 return 0;
3921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003923 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003924 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003925 Py_DECREF(output);
3926 return 0;
3927 }
3928 *(PyObject**)addr = output;
3929 return Py_CLEANUP_SUPPORTED;
3930}
3931
3932
Martin v. Löwis5b222132007-06-10 09:51:05 +00003933char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003935{
Christian Heimesf3863112007-11-22 07:46:41 +00003936 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003938 if (!PyUnicode_Check(unicode)) {
3939 PyErr_BadArgument();
3940 return NULL;
3941 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003942 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003943 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003945 if (PyUnicode_UTF8(unicode) == NULL) {
3946 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003947 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 if (bytes == NULL)
3949 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3951 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003952 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 Py_DECREF(bytes);
3954 return NULL;
3955 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003956 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3957 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3958 PyBytes_AS_STRING(bytes),
3959 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 Py_DECREF(bytes);
3961 }
3962
3963 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003964 *psize = PyUnicode_UTF8_LENGTH(unicode);
3965 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003966}
3967
3968char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3972}
3973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974Py_UNICODE *
3975PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 const unsigned char *one_byte;
3978#if SIZEOF_WCHAR_T == 4
3979 const Py_UCS2 *two_bytes;
3980#else
3981 const Py_UCS4 *four_bytes;
3982 const Py_UCS4 *ucs4_end;
3983 Py_ssize_t num_surrogates;
3984#endif
3985 wchar_t *w;
3986 wchar_t *wchar_end;
3987
3988 if (!PyUnicode_Check(unicode)) {
3989 PyErr_BadArgument();
3990 return NULL;
3991 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003992 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003994 assert(_PyUnicode_KIND(unicode) != 0);
3995 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4000 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 num_surrogates = 0;
4002
4003 for (; four_bytes < ucs4_end; ++four_bytes) {
4004 if (*four_bytes > 0xFFFF)
4005 ++num_surrogates;
4006 }
4007
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004008 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4009 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4010 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 PyErr_NoMemory();
4012 return NULL;
4013 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004014 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004016 w = _PyUnicode_WSTR(unicode);
4017 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4018 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4020 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004021 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004023 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4024 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025 }
4026 else
4027 *w = *four_bytes;
4028
4029 if (w > wchar_end) {
4030 assert(0 && "Miscalculated string end");
4031 }
4032 }
4033 *w = 0;
4034#else
4035 /* sizeof(wchar_t) == 4 */
4036 Py_FatalError("Impossible unicode object state, wstr and str "
4037 "should share memory already.");
4038 return NULL;
4039#endif
4040 }
4041 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004042 if ((size_t)_PyUnicode_LENGTH(unicode) >
4043 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4044 PyErr_NoMemory();
4045 return NULL;
4046 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004047 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4048 (_PyUnicode_LENGTH(unicode) + 1));
4049 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyErr_NoMemory();
4051 return NULL;
4052 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4054 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4055 w = _PyUnicode_WSTR(unicode);
4056 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004058 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4059 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 for (; w < wchar_end; ++one_byte, ++w)
4061 *w = *one_byte;
4062 /* null-terminate the wstr */
4063 *w = 0;
4064 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004065 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004067 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 for (; w < wchar_end; ++two_bytes, ++w)
4069 *w = *two_bytes;
4070 /* null-terminate the wstr */
4071 *w = 0;
4072#else
4073 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004074 PyObject_FREE(_PyUnicode_WSTR(unicode));
4075 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 Py_FatalError("Impossible unicode object state, wstr "
4077 "and str should share memory already.");
4078 return NULL;
4079#endif
4080 }
4081 else {
4082 assert(0 && "This should never happen.");
4083 }
4084 }
4085 }
4086 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004087 *size = PyUnicode_WSTR_LENGTH(unicode);
4088 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004089}
4090
Alexander Belopolsky40018472011-02-26 01:02:56 +00004091Py_UNICODE *
4092PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095}
4096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097
Alexander Belopolsky40018472011-02-26 01:02:56 +00004098Py_ssize_t
4099PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100{
4101 if (!PyUnicode_Check(unicode)) {
4102 PyErr_BadArgument();
4103 goto onError;
4104 }
4105 return PyUnicode_GET_SIZE(unicode);
4106
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 return -1;
4109}
4110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111Py_ssize_t
4112PyUnicode_GetLength(PyObject *unicode)
4113{
Victor Stinner07621332012-06-16 04:53:46 +02004114 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 PyErr_BadArgument();
4116 return -1;
4117 }
Victor Stinner07621332012-06-16 04:53:46 +02004118 if (PyUnicode_READY(unicode) == -1)
4119 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 return PyUnicode_GET_LENGTH(unicode);
4121}
4122
4123Py_UCS4
4124PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4125{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004126 void *data;
4127 int kind;
4128
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004129 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4130 PyErr_BadArgument();
4131 return (Py_UCS4)-1;
4132 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004133 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004134 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135 return (Py_UCS4)-1;
4136 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004137 data = PyUnicode_DATA(unicode);
4138 kind = PyUnicode_KIND(unicode);
4139 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140}
4141
4142int
4143PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4144{
4145 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004146 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 return -1;
4148 }
Victor Stinner488fa492011-12-12 00:01:39 +01004149 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004150 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004151 PyErr_SetString(PyExc_IndexError, "string index out of range");
4152 return -1;
4153 }
Victor Stinner488fa492011-12-12 00:01:39 +01004154 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004155 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004156 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4157 PyErr_SetString(PyExc_ValueError, "character out of range");
4158 return -1;
4159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004160 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4161 index, ch);
4162 return 0;
4163}
4164
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4170
Victor Stinner554f3f02010-06-16 23:33:54 +00004171/* create or adjust a UnicodeDecodeError */
4172static void
4173make_decode_exception(PyObject **exceptionObject,
4174 const char *encoding,
4175 const char *input, Py_ssize_t length,
4176 Py_ssize_t startpos, Py_ssize_t endpos,
4177 const char *reason)
4178{
4179 if (*exceptionObject == NULL) {
4180 *exceptionObject = PyUnicodeDecodeError_Create(
4181 encoding, input, length, startpos, endpos, reason);
4182 }
4183 else {
4184 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4185 goto onError;
4186 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4187 goto onError;
4188 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4189 goto onError;
4190 }
4191 return;
4192
4193onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004194 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004195}
4196
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004197#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198/* error handling callback helper:
4199 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004200 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 and adjust various state variables.
4202 return 0 on success, -1 on error
4203*/
4204
Alexander Belopolsky40018472011-02-26 01:02:56 +00004205static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004206unicode_decode_call_errorhandler_wchar(
4207 const char *errors, PyObject **errorHandler,
4208 const char *encoding, const char *reason,
4209 const char **input, const char **inend, Py_ssize_t *startinpos,
4210 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4211 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004213 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214
4215 PyObject *restuple = NULL;
4216 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004217 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004218 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004219 Py_ssize_t requiredsize;
4220 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004221 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004222 wchar_t *repwstr;
4223 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004225 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4226 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 *errorHandler = PyCodec_LookupError(errors);
4230 if (*errorHandler == NULL)
4231 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 }
4233
Victor Stinner554f3f02010-06-16 23:33:54 +00004234 make_decode_exception(exceptionObject,
4235 encoding,
4236 *input, *inend - *input,
4237 *startinpos, *endinpos,
4238 reason);
4239 if (*exceptionObject == NULL)
4240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241
4242 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4243 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004246 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 }
4249 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251
4252 /* Copy back the bytes variables, which might have been modified by the
4253 callback */
4254 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4255 if (!inputobj)
4256 goto onError;
4257 if (!PyBytes_Check(inputobj)) {
4258 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4259 }
4260 *input = PyBytes_AS_STRING(inputobj);
4261 insize = PyBytes_GET_SIZE(inputobj);
4262 *inend = *input + insize;
4263 /* we can DECREF safely, as the exception has another reference,
4264 so the object won't go away. */
4265 Py_DECREF(inputobj);
4266
4267 if (newpos<0)
4268 newpos = insize+newpos;
4269 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004270 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271 goto onError;
4272 }
4273
4274 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4275 if (repwstr == NULL)
4276 goto onError;
4277 /* need more space? (at least enough for what we
4278 have+the replacement+the rest of the string (starting
4279 at the new input position), so we won't have to check space
4280 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004281 requiredsize = *outpos;
4282 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4283 goto overflow;
4284 requiredsize += repwlen;
4285 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4286 goto overflow;
4287 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004289 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 requiredsize = 2*outsize;
4291 if (unicode_resize(output, requiredsize) < 0)
4292 goto onError;
4293 }
4294 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4295 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 *endinpos = newpos;
4297 *inptr = *input + newpos;
4298
4299 /* we made it! */
4300 Py_XDECREF(restuple);
4301 return 0;
4302
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004303 overflow:
4304 PyErr_SetString(PyExc_OverflowError,
4305 "decoded result is too long for a Python string");
4306
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307 onError:
4308 Py_XDECREF(restuple);
4309 return -1;
4310}
4311#endif /* HAVE_MBCS */
4312
4313static int
4314unicode_decode_call_errorhandler_writer(
4315 const char *errors, PyObject **errorHandler,
4316 const char *encoding, const char *reason,
4317 const char **input, const char **inend, Py_ssize_t *startinpos,
4318 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4319 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4320{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004321 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322
4323 PyObject *restuple = NULL;
4324 PyObject *repunicode = NULL;
4325 Py_ssize_t insize;
4326 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004327 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 PyObject *inputobj = NULL;
4329
4330 if (*errorHandler == NULL) {
4331 *errorHandler = PyCodec_LookupError(errors);
4332 if (*errorHandler == NULL)
4333 goto onError;
4334 }
4335
4336 make_decode_exception(exceptionObject,
4337 encoding,
4338 *input, *inend - *input,
4339 *startinpos, *endinpos,
4340 reason);
4341 if (*exceptionObject == NULL)
4342 goto onError;
4343
4344 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4345 if (restuple == NULL)
4346 goto onError;
4347 if (!PyTuple_Check(restuple)) {
4348 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4349 goto onError;
4350 }
4351 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004352 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004353
4354 /* Copy back the bytes variables, which might have been modified by the
4355 callback */
4356 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4357 if (!inputobj)
4358 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004359 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004361 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004362 *input = PyBytes_AS_STRING(inputobj);
4363 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004364 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004365 /* we can DECREF safely, as the exception has another reference,
4366 so the object won't go away. */
4367 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004371 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004372 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375
Victor Stinner8f674cc2013-04-17 23:02:17 +02004376 if (PyUnicode_READY(repunicode) < 0)
4377 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004378 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004379 if (replen > 1) {
4380 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004381 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004382 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4383 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4384 goto onError;
4385 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004386 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004387 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004390 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004393 Py_XDECREF(restuple);
4394 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399}
4400
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401/* --- UTF-7 Codec -------------------------------------------------------- */
4402
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403/* See RFC2152 for details. We encode conservatively and decode liberally. */
4404
4405/* Three simple macros defining base-64. */
4406
/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated several times. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ('0' <= (c) && (c) <= '9') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('A' <= (c) && (c) <= 'Z'))
4414
/* Map a base-64 character to its 6-bit value.  The caller must already know
   that c is a base-64 character: anything that is not a letter, digit or
   '+' yields 63. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? ((c) - 'A') :             \
     ('a' <= (c) && (c) <= 'z') ? ((c) - 'a' + 26) :        \
     ('0' <= (c) && (c) <= '9') ? ((c) - '0' + 52) :        \
     ((c) == '+') ? 62 : 63)
4422
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 0x3f])
4427
/* DECODE_DIRECT: true when a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4435
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * so it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4469
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        character code; only 1..127 can ever be direct
 * directO  non-zero if Set O characters (category 1) are written as-is
 * directWS non-zero if whitespace (category 2) is written as-is
 *
 * Every macro argument is parenthesized so that compound expressions
 * (e.g. `a || b`) keep their meaning after expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481
Alexander Belopolsky40018472011-02-26 01:02:56 +00004482PyObject *
4483PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004484 Py_ssize_t size,
4485 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004487 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4488}
4489
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490/* The decoder. The only state we preserve is our read position,
4491 * i.e. how many characters we have consumed. So if we end in the
4492 * middle of a shift sequence we have to back off the read position
4493 * and the output to the beginning of the sequence, otherwise we lose
4494 * all the shift state (seen bits, number of bits seen, high
4495 * surrogate). */
4496
Alexander Belopolsky40018472011-02-26 01:02:56 +00004497PyObject *
4498PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004499 Py_ssize_t size,
4500 const char *errors,
4501 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004504 Py_ssize_t startinpos;
4505 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004507 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 const char *errmsg = "";
4509 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004510 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 unsigned int base64bits = 0;
4512 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004513 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 PyObject *errorHandler = NULL;
4515 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004517 if (size == 0) {
4518 if (consumed)
4519 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004520 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004521 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004523 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004524 _PyUnicodeWriter_Init(&writer);
4525 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004526
4527 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 e = s + size;
4529
4530 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004531 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004533 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 if (inShift) { /* in a base-64 section */
4536 if (IS_BASE64(ch)) { /* consume a base-64 character */
4537 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4538 base64bits += 6;
4539 s++;
4540 if (base64bits >= 16) {
4541 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004542 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 base64bits -= 16;
4544 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004545 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 if (surrogate) {
4547 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004548 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4549 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004550 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004551 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004553 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 }
4555 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004556 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004557 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 }
4560 }
Victor Stinner551ac952011-11-29 22:58:13 +01004561 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 /* first surrogate */
4563 surrogate = outCh;
4564 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004566 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004567 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 }
4569 }
4570 }
4571 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 if (base64bits > 0) { /* left-over bits */
4574 if (base64bits >= 6) {
4575 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004576 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 errmsg = "partial character in shift sequence";
4578 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 else {
4581 /* Some bits remain; they should be zero */
4582 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004583 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 errmsg = "non-zero padding bits in shift sequence";
4585 goto utf7Error;
4586 }
4587 }
4588 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004589 if (surrogate && DECODE_DIRECT(ch)) {
4590 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4591 goto onError;
4592 }
4593 surrogate = 0;
4594 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 /* '-' is absorbed; other terminating
4596 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004597 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599 }
4600 }
4601 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 s++; /* consume '+' */
4604 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004606 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004607 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
4609 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004611 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004612 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004614 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615 }
4616 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004619 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004620 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 else {
4623 startinpos = s-starts;
4624 s++;
4625 errmsg = "unexpected special character";
4626 goto utf7Error;
4627 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 errors, &errorHandler,
4633 "utf7", errmsg,
4634 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 }
4638
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 /* end of string */
4640
4641 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4642 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004643 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 if (surrogate ||
4645 (base64bits >= 6) ||
4646 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 errors, &errorHandler,
4650 "utf7", "unterminated shift sequence",
4651 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 goto onError;
4654 if (s < e)
4655 goto restart;
4656 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658
4659 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004660 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004662 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004663 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004664 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004665 writer.kind, writer.data, shiftOutStart);
4666 Py_XDECREF(errorHandler);
4667 Py_XDECREF(exc);
4668 _PyUnicodeWriter_Dealloc(&writer);
4669 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004670 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004671 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 }
4673 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004674 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 Py_XDECREF(errorHandler);
4679 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 Py_XDECREF(errorHandler);
4684 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004686 return NULL;
4687}
4688
4689
Alexander Belopolsky40018472011-02-26 01:02:56 +00004690PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004691_PyUnicode_EncodeUTF7(PyObject *str,
4692 int base64SetO,
4693 int base64WhiteSpace,
4694 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004696 int kind;
4697 void *data;
4698 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004699 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004700 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 unsigned int base64bits = 0;
4703 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704 char * out;
4705 char * start;
4706
Benjamin Petersonbac79492012-01-14 13:34:47 -05004707 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004708 return NULL;
4709 kind = PyUnicode_KIND(str);
4710 data = PyUnicode_DATA(str);
4711 len = PyUnicode_GET_LENGTH(str);
4712
4713 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004717 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004718 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004719 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720 if (v == NULL)
4721 return NULL;
4722
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004723 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004724 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004725 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 if (inShift) {
4728 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4729 /* shifting out */
4730 if (base64bits) { /* output remaining bits */
4731 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4732 base64buffer = 0;
4733 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004734 }
4735 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 /* Characters not in the BASE64 set implicitly unshift the sequence
4737 so no '-' is required, except if the character is itself a '-' */
4738 if (IS_BASE64(ch) || ch == '-') {
4739 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 *out++ = (char) ch;
4742 }
4743 else {
4744 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004745 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004747 else { /* not in a shift sequence */
4748 if (ch == '+') {
4749 *out++ = '+';
4750 *out++ = '-';
4751 }
4752 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4753 *out++ = (char) ch;
4754 }
4755 else {
4756 *out++ = '+';
4757 inShift = 1;
4758 goto encode_char;
4759 }
4760 }
4761 continue;
4762encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004764 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004765
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 /* code first surrogate */
4767 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004768 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769 while (base64bits >= 6) {
4770 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4771 base64bits -= 6;
4772 }
4773 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004774 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 base64bits += 16;
4777 base64buffer = (base64buffer << 16) | ch;
4778 while (base64bits >= 6) {
4779 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4780 base64bits -= 6;
4781 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004782 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783 if (base64bits)
4784 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4785 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004787 if (_PyBytes_Resize(&v, out - start) < 0)
4788 return NULL;
4789 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004791PyObject *
4792PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4793 Py_ssize_t size,
4794 int base64SetO,
4795 int base64WhiteSpace,
4796 const char *errors)
4797{
4798 PyObject *result;
4799 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4800 if (tmp == NULL)
4801 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004802 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004803 base64WhiteSpace, errors);
4804 Py_DECREF(tmp);
4805 return result;
4806}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004807
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808#undef IS_BASE64
4809#undef FROM_BASE64
4810#undef TO_BASE64
4811#undef DECODE_DIRECT
4812#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814/* --- UTF-8 Codec -------------------------------------------------------- */
4815
Alexander Belopolsky40018472011-02-26 01:02:56 +00004816PyObject *
4817PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004818 Py_ssize_t size,
4819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820{
Walter Dörwald69652032004-09-07 20:24:22 +00004821 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4822}
4823
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004824#include "stringlib/asciilib.h"
4825#include "stringlib/codecs.h"
4826#include "stringlib/undef.h"
4827
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004828#include "stringlib/ucs1lib.h"
4829#include "stringlib/codecs.h"
4830#include "stringlib/undef.h"
4831
4832#include "stringlib/ucs2lib.h"
4833#include "stringlib/codecs.h"
4834#include "stringlib/undef.h"
4835
4836#include "stringlib/ucs4lib.h"
4837#include "stringlib/codecs.h"
4838#include "stringlib/undef.h"
4839
Antoine Pitrouab868312009-01-10 15:40:25 +00004840/* Mask to quickly check whether a C 'long' contains a
4841 non-ASCII, UTF8-encoded char. */
4842#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004843# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004844#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004845# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004846#else
4847# error C 'long' size should be either 4 or 8!
4848#endif
4849
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850static Py_ssize_t
4851ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004852{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004854 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004855
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004856 /*
4857 * Issue #17237: m68k is a bit different from most architectures in
4858 * that objects do not use "natural alignment" - for example, int and
4859 * long are only aligned at 2-byte boundaries. Therefore the assert()
4860 * won't work; also, tests have shown that skipping the "optimised
4861 * version" will even speed up m68k.
4862 */
4863#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004865 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4866 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004867 /* Fast path, see in STRINGLIB(utf8_decode) for
4868 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004869 /* Help allocation */
4870 const char *_p = p;
4871 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 while (_p < aligned_end) {
4873 unsigned long value = *(const unsigned long *) _p;
4874 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004876 *((unsigned long *)q) = value;
4877 _p += SIZEOF_LONG;
4878 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004879 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 p = _p;
4881 while (p < end) {
4882 if ((unsigned char)*p & 0x80)
4883 break;
4884 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004886 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004889#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 while (p < end) {
4891 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4892 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004893 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004894 /* Help allocation */
4895 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 while (_p < aligned_end) {
4897 unsigned long value = *(unsigned long *) _p;
4898 if (value & ASCII_CHAR_MASK)
4899 break;
4900 _p += SIZEOF_LONG;
4901 }
4902 p = _p;
4903 if (_p == end)
4904 break;
4905 }
4906 if ((unsigned char)*p & 0x80)
4907 break;
4908 ++p;
4909 }
4910 memcpy(dest, start, p - start);
4911 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912}
Antoine Pitrouab868312009-01-10 15:40:25 +00004913
Victor Stinner785938e2011-12-11 20:09:03 +01004914PyObject *
4915PyUnicode_DecodeUTF8Stateful(const char *s,
4916 Py_ssize_t size,
4917 const char *errors,
4918 Py_ssize_t *consumed)
4919{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004921 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923
4924 Py_ssize_t startinpos;
4925 Py_ssize_t endinpos;
4926 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004927 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004929 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004930
4931 if (size == 0) {
4932 if (consumed)
4933 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004934 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004935 }
4936
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4938 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004939 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 *consumed = 1;
4941 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004942 }
4943
Victor Stinner8f674cc2013-04-17 23:02:17 +02004944 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004945 writer.min_length = size;
4946 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004947 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004948
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004949 writer.pos = ascii_decode(s, end, writer.data);
4950 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 while (s < end) {
4952 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004953 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004954
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004956 if (PyUnicode_IS_ASCII(writer.buffer))
4957 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004959 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004961 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962 } else {
4963 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004964 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 }
4966
4967 switch (ch) {
4968 case 0:
4969 if (s == end || consumed)
4970 goto End;
4971 errmsg = "unexpected end of data";
4972 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004973 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 break;
4975 case 1:
4976 errmsg = "invalid start byte";
4977 startinpos = s - starts;
4978 endinpos = startinpos + 1;
4979 break;
4980 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004981 case 3:
4982 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983 errmsg = "invalid continuation byte";
4984 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004985 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 break;
4987 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004988 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 goto onError;
4990 continue;
4991 }
4992
Victor Stinner1d65d912015-10-05 13:43:50 +02004993 if (error_handler == _Py_ERROR_UNKNOWN)
4994 error_handler = get_error_handler(errors);
4995
4996 switch (error_handler) {
4997 case _Py_ERROR_IGNORE:
4998 s += (endinpos - startinpos);
4999 break;
5000
5001 case _Py_ERROR_REPLACE:
5002 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5003 goto onError;
5004 s += (endinpos - startinpos);
5005 break;
5006
5007 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005008 {
5009 Py_ssize_t i;
5010
Victor Stinner1d65d912015-10-05 13:43:50 +02005011 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5012 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005013 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005014 ch = (Py_UCS4)(unsigned char)(starts[i]);
5015 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5016 ch + 0xdc00);
5017 writer.pos++;
5018 }
5019 s += (endinpos - startinpos);
5020 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005021 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005022
5023 default:
5024 if (unicode_decode_call_errorhandler_writer(
5025 errors, &error_handler_obj,
5026 "utf-8", errmsg,
5027 &starts, &end, &startinpos, &endinpos, &exc, &s,
5028 &writer))
5029 goto onError;
5030 }
Victor Stinner785938e2011-12-11 20:09:03 +01005031 }
5032
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005034 if (consumed)
5035 *consumed = s - starts;
5036
Victor Stinner1d65d912015-10-05 13:43:50 +02005037 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005039 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040
5041onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005042 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005044 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005045 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005046}
5047
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005048#ifdef __APPLE__
5049
5050/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005051 used to decode the command line arguments on Mac OS X.
5052
5053 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005054 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005055
5056wchar_t*
5057_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5058{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005059 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 wchar_t *unicode;
5061 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005062
5063 /* Note: size will always be longer than the resulting Unicode
5064 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005065 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005066 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005067 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005068 if (!unicode)
5069 return NULL;
5070
5071 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005072 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 if (ch > 0xFF) {
5082#if SIZEOF_WCHAR_T == 4
5083 assert(0);
5084#else
5085 assert(Py_UNICODE_IS_SURROGATE(ch));
5086 /* compute and append the two surrogates: */
5087 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5088 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5089#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005090 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091 else {
5092 if (!ch && s == e)
5093 break;
5094 /* surrogateescape */
5095 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5096 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005097 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005098 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005099 return unicode;
5100}
5101
5102#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005104/* Primary internal function which creates utf8 encoded bytes objects.
5105
5106 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005107 and allocate exactly as much space needed at the end. Else allocate the
5108 maximum possible needed (4 result bytes per Unicode character), and return
5109 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005110*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005111PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005112_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113{
Victor Stinner6099a032011-12-18 14:22:26 +01005114 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 void *data;
5116 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005118 if (!PyUnicode_Check(unicode)) {
5119 PyErr_BadArgument();
5120 return NULL;
5121 }
5122
5123 if (PyUnicode_READY(unicode) == -1)
5124 return NULL;
5125
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005126 if (PyUnicode_UTF8(unicode))
5127 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5128 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005129
5130 kind = PyUnicode_KIND(unicode);
5131 data = PyUnicode_DATA(unicode);
5132 size = PyUnicode_GET_LENGTH(unicode);
5133
Benjamin Petersonead6b532011-12-20 17:23:42 -06005134 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005135 default:
5136 assert(0);
5137 case PyUnicode_1BYTE_KIND:
5138 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5139 assert(!PyUnicode_IS_ASCII(unicode));
5140 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5141 case PyUnicode_2BYTE_KIND:
5142 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5143 case PyUnicode_4BYTE_KIND:
5144 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146}
5147
Alexander Belopolsky40018472011-02-26 01:02:56 +00005148PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5150 Py_ssize_t size,
5151 const char *errors)
5152{
5153 PyObject *v, *unicode;
5154
5155 unicode = PyUnicode_FromUnicode(s, size);
5156 if (unicode == NULL)
5157 return NULL;
5158 v = _PyUnicode_AsUTF8String(unicode, errors);
5159 Py_DECREF(unicode);
5160 return v;
5161}
5162
5163PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005164PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167}
5168
Walter Dörwald41980ca2007-08-16 21:55:45 +00005169/* --- UTF-32 Codec ------------------------------------------------------- */
5170
5171PyObject *
5172PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 Py_ssize_t size,
5174 const char *errors,
5175 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005176{
5177 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5178}
5179
5180PyObject *
5181PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 Py_ssize_t size,
5183 const char *errors,
5184 int *byteorder,
5185 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005186{
5187 const char *starts = s;
5188 Py_ssize_t startinpos;
5189 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005190 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005191 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005192 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005193 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195 PyObject *errorHandler = NULL;
5196 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005197
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 q = (unsigned char *)s;
5199 e = q + size;
5200
5201 if (byteorder)
5202 bo = *byteorder;
5203
5204 /* Check for BOM marks (U+FEFF) in the input and adjust current
5205 byte order setting accordingly. In native mode, the leading BOM
5206 mark is skipped, in all other modes, it is copied to the output
5207 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005208 if (bo == 0 && size >= 4) {
5209 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5210 if (bom == 0x0000FEFF) {
5211 bo = -1;
5212 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005214 else if (bom == 0xFFFE0000) {
5215 bo = 1;
5216 q += 4;
5217 }
5218 if (byteorder)
5219 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005220 }
5221
Victor Stinnere64322e2012-10-30 23:12:47 +01005222 if (q == e) {
5223 if (consumed)
5224 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005225 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226 }
5227
Victor Stinnere64322e2012-10-30 23:12:47 +01005228#ifdef WORDS_BIGENDIAN
5229 le = bo < 0;
5230#else
5231 le = bo <= 0;
5232#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005233 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005234
Victor Stinner8f674cc2013-04-17 23:02:17 +02005235 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005236 writer.min_length = (e - q + 3) / 4;
5237 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005238 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005239
Victor Stinnere64322e2012-10-30 23:12:47 +01005240 while (1) {
5241 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005242 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005243
Victor Stinnere64322e2012-10-30 23:12:47 +01005244 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005245 enum PyUnicode_Kind kind = writer.kind;
5246 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005247 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005248 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 if (le) {
5250 do {
5251 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5252 if (ch > maxch)
5253 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005254 if (kind != PyUnicode_1BYTE_KIND &&
5255 Py_UNICODE_IS_SURROGATE(ch))
5256 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005257 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005258 q += 4;
5259 } while (q <= last);
5260 }
5261 else {
5262 do {
5263 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5264 if (ch > maxch)
5265 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005266 if (kind != PyUnicode_1BYTE_KIND &&
5267 Py_UNICODE_IS_SURROGATE(ch))
5268 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005269 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005270 q += 4;
5271 } while (q <= last);
5272 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005273 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005274 }
5275
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005276 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005277 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005278 startinpos = ((const char *)q) - starts;
5279 endinpos = startinpos + 4;
5280 }
5281 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005284 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005286 startinpos = ((const char *)q) - starts;
5287 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005289 else {
5290 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005291 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005292 goto onError;
5293 q += 4;
5294 continue;
5295 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005296 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 startinpos = ((const char *)q) - starts;
5298 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005300
5301 /* The remaining input chars are ignored if the callback
5302 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005303 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005305 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005309 }
5310
Walter Dörwald41980ca2007-08-16 21:55:45 +00005311 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314 Py_XDECREF(errorHandler);
5315 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005319 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005320 Py_XDECREF(errorHandler);
5321 Py_XDECREF(exc);
5322 return NULL;
5323}
5324
5325PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005326_PyUnicode_EncodeUTF32(PyObject *str,
5327 const char *errors,
5328 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005329{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005330 enum PyUnicode_Kind kind;
5331 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005332 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005333 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005334 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005335#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005336 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005337#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005338 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005339#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005341 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005342 PyObject *errorHandler = NULL;
5343 PyObject *exc = NULL;
5344 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005345
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005346 if (!PyUnicode_Check(str)) {
5347 PyErr_BadArgument();
5348 return NULL;
5349 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005350 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005351 return NULL;
5352 kind = PyUnicode_KIND(str);
5353 data = PyUnicode_DATA(str);
5354 len = PyUnicode_GET_LENGTH(str);
5355
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005356 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005357 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005358 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005359 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 if (v == NULL)
5361 return NULL;
5362
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005363 /* output buffer is 4-bytes aligned */
5364 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005365 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005366 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005367 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005368 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005369 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005371 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005372 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005373 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005374 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005375 else
5376 encoding = "utf-32";
5377
5378 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5380 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381 }
5382
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005383 pos = 0;
5384 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005385 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005386
5387 if (kind == PyUnicode_2BYTE_KIND) {
5388 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5389 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 else {
5392 assert(kind == PyUnicode_4BYTE_KIND);
5393 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5394 &out, native_ordering);
5395 }
5396 if (pos == len)
5397 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005398
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 rep = unicode_encode_call_errorhandler(
5400 errors, &errorHandler,
5401 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005402 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005403 if (!rep)
5404 goto error;
5405
5406 if (PyBytes_Check(rep)) {
5407 repsize = PyBytes_GET_SIZE(rep);
5408 if (repsize & 3) {
5409 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005411 "surrogates not allowed");
5412 goto error;
5413 }
5414 moreunits = repsize / 4;
5415 }
5416 else {
5417 assert(PyUnicode_Check(rep));
5418 if (PyUnicode_READY(rep) < 0)
5419 goto error;
5420 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5421 if (!PyUnicode_IS_ASCII(rep)) {
5422 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 "surrogates not allowed");
5425 goto error;
5426 }
5427 }
5428
5429 /* four bytes are reserved for each surrogate */
5430 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005431 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 Py_ssize_t morebytes = 4 * (moreunits - 1);
5433 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5434 /* integer overflow */
5435 PyErr_NoMemory();
5436 goto error;
5437 }
5438 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5439 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005440 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 }
5442
5443 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005444 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5445 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005447 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5449 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 }
5451
5452 Py_CLEAR(rep);
5453 }
5454
5455 /* Cut back to size actually needed. This is necessary for, for example,
5456 encoding of a string containing isolated surrogates and the 'ignore'
5457 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005458 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 if (nsize != PyBytes_GET_SIZE(v))
5460 _PyBytes_Resize(&v, nsize);
5461 Py_XDECREF(errorHandler);
5462 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005464 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 error:
5466 Py_XDECREF(rep);
5467 Py_XDECREF(errorHandler);
5468 Py_XDECREF(exc);
5469 Py_XDECREF(v);
5470 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005471}
5472
Alexander Belopolsky40018472011-02-26 01:02:56 +00005473PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005474PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5475 Py_ssize_t size,
5476 const char *errors,
5477 int byteorder)
5478{
5479 PyObject *result;
5480 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5481 if (tmp == NULL)
5482 return NULL;
5483 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5484 Py_DECREF(tmp);
5485 return result;
5486}
5487
5488PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005489PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005490{
Victor Stinnerb960b342011-11-20 19:12:52 +01005491 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005492}
5493
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494/* --- UTF-16 Codec ------------------------------------------------------- */
5495
Tim Peters772747b2001-08-09 22:21:55 +00005496PyObject *
5497PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 Py_ssize_t size,
5499 const char *errors,
5500 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501{
Walter Dörwald69652032004-09-07 20:24:22 +00005502 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5503}
5504
5505PyObject *
5506PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 Py_ssize_t size,
5508 const char *errors,
5509 int *byteorder,
5510 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005511{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005513 Py_ssize_t startinpos;
5514 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005515 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005516 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005517 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005518 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005519 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 PyObject *errorHandler = NULL;
5521 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005522 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523
Tim Peters772747b2001-08-09 22:21:55 +00005524 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005525 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526
5527 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005528 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005530 /* Check for BOM marks (U+FEFF) in the input and adjust current
5531 byte order setting accordingly. In native mode, the leading BOM
5532 mark is skipped, in all other modes, it is copied to the output
5533 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005534 if (bo == 0 && size >= 2) {
5535 const Py_UCS4 bom = (q[1] << 8) | q[0];
5536 if (bom == 0xFEFF) {
5537 q += 2;
5538 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005540 else if (bom == 0xFFFE) {
5541 q += 2;
5542 bo = 1;
5543 }
5544 if (byteorder)
5545 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547
Antoine Pitrou63065d72012-05-15 23:48:04 +02005548 if (q == e) {
5549 if (consumed)
5550 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005551 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005552 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005553
Christian Heimes743e0cd2012-10-17 23:52:17 +02005554#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005555 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005557#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005558 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005559 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005560#endif
Tim Peters772747b2001-08-09 22:21:55 +00005561
Antoine Pitrou63065d72012-05-15 23:48:04 +02005562 /* Note: size will always be longer than the resulting Unicode
5563 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005564 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005565 writer.min_length = (e - q + 1) / 2;
5566 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005567 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568
Antoine Pitrou63065d72012-05-15 23:48:04 +02005569 while (1) {
5570 Py_UCS4 ch = 0;
5571 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005574 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005575 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005576 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005577 native_ordering);
5578 else
5579 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005580 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 native_ordering);
5582 } else if (kind == PyUnicode_2BYTE_KIND) {
5583 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005584 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005585 native_ordering);
5586 } else {
5587 assert(kind == PyUnicode_4BYTE_KIND);
5588 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005591 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005592 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594 switch (ch)
5595 {
5596 case 0:
5597 /* remaining byte at the end? (size should be even) */
5598 if (q == e || consumed)
5599 goto End;
5600 errmsg = "truncated data";
5601 startinpos = ((const char *)q) - starts;
5602 endinpos = ((const char *)e) - starts;
5603 break;
5604 /* The remaining input chars are ignored if the callback
5605 chooses to skip the input */
5606 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005607 q -= 2;
5608 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005609 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005610 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005611 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 endinpos = ((const char *)e) - starts;
5613 break;
5614 case 2:
5615 errmsg = "illegal encoding";
5616 startinpos = ((const char *)q) - 2 - starts;
5617 endinpos = startinpos + 2;
5618 break;
5619 case 3:
5620 errmsg = "illegal UTF-16 surrogate";
5621 startinpos = ((const char *)q) - 4 - starts;
5622 endinpos = startinpos + 2;
5623 break;
5624 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005625 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005626 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 continue;
5628 }
5629
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005631 errors,
5632 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005633 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005634 &starts,
5635 (const char **)&e,
5636 &startinpos,
5637 &endinpos,
5638 &exc,
5639 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005640 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 }
5643
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644End:
Walter Dörwald69652032004-09-07 20:24:22 +00005645 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005647
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 Py_XDECREF(errorHandler);
5649 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005650 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005653 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 Py_XDECREF(errorHandler);
5655 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 return NULL;
5657}
5658
Tim Peters772747b2001-08-09 22:21:55 +00005659PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660_PyUnicode_EncodeUTF16(PyObject *str,
5661 const char *errors,
5662 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005664 enum PyUnicode_Kind kind;
5665 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005666 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005667 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005668 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005669 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005670#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005671 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005672#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005673 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005674#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005675 const char *encoding;
5676 Py_ssize_t nsize, pos;
5677 PyObject *errorHandler = NULL;
5678 PyObject *exc = NULL;
5679 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005680
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005681 if (!PyUnicode_Check(str)) {
5682 PyErr_BadArgument();
5683 return NULL;
5684 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005685 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005686 return NULL;
5687 kind = PyUnicode_KIND(str);
5688 data = PyUnicode_DATA(str);
5689 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005690
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005691 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005692 if (kind == PyUnicode_4BYTE_KIND) {
5693 const Py_UCS4 *in = (const Py_UCS4 *)data;
5694 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005695 while (in < end) {
5696 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005697 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005698 }
5699 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005700 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005701 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005703 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005704 nsize = len + pairs + (byteorder == 0);
5705 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005706 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005710 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005711 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005712 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005713 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005714 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005715 }
5716 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005717 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005718 }
Tim Peters772747b2001-08-09 22:21:55 +00005719
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005720 if (kind == PyUnicode_1BYTE_KIND) {
5721 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5722 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005723 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005724
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005725 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005727 }
5728 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005729 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005730 }
5731 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734
5735 pos = 0;
5736 while (pos < len) {
5737 Py_ssize_t repsize, moreunits;
5738
5739 if (kind == PyUnicode_2BYTE_KIND) {
5740 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5741 &out, native_ordering);
5742 }
5743 else {
5744 assert(kind == PyUnicode_4BYTE_KIND);
5745 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5746 &out, native_ordering);
5747 }
5748 if (pos == len)
5749 break;
5750
5751 rep = unicode_encode_call_errorhandler(
5752 errors, &errorHandler,
5753 encoding, "surrogates not allowed",
5754 str, &exc, pos, pos + 1, &pos);
5755 if (!rep)
5756 goto error;
5757
5758 if (PyBytes_Check(rep)) {
5759 repsize = PyBytes_GET_SIZE(rep);
5760 if (repsize & 1) {
5761 raise_encode_exception(&exc, encoding,
5762 str, pos - 1, pos,
5763 "surrogates not allowed");
5764 goto error;
5765 }
5766 moreunits = repsize / 2;
5767 }
5768 else {
5769 assert(PyUnicode_Check(rep));
5770 if (PyUnicode_READY(rep) < 0)
5771 goto error;
5772 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5773 if (!PyUnicode_IS_ASCII(rep)) {
5774 raise_encode_exception(&exc, encoding,
5775 str, pos - 1, pos,
5776 "surrogates not allowed");
5777 goto error;
5778 }
5779 }
5780
5781 /* two bytes are reserved for each surrogate */
5782 if (moreunits > 1) {
5783 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5784 Py_ssize_t morebytes = 2 * (moreunits - 1);
5785 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5786 /* integer overflow */
5787 PyErr_NoMemory();
5788 goto error;
5789 }
5790 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5791 goto error;
5792 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5793 }
5794
5795 if (PyBytes_Check(rep)) {
5796 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5797 out += moreunits;
5798 } else /* rep is unicode */ {
5799 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5800 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5801 &out, native_ordering);
5802 }
5803
5804 Py_CLEAR(rep);
5805 }
5806
5807 /* Cut back to size actually needed. This is necessary for, for example,
5808 encoding of a string containing isolated surrogates and the 'ignore' handler
5809 is used. */
5810 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5811 if (nsize != PyBytes_GET_SIZE(v))
5812 _PyBytes_Resize(&v, nsize);
5813 Py_XDECREF(errorHandler);
5814 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005815 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005817 error:
5818 Py_XDECREF(rep);
5819 Py_XDECREF(errorHandler);
5820 Py_XDECREF(exc);
5821 Py_XDECREF(v);
5822 return NULL;
5823#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824}
5825
Alexander Belopolsky40018472011-02-26 01:02:56 +00005826PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005827PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5828 Py_ssize_t size,
5829 const char *errors,
5830 int byteorder)
5831{
5832 PyObject *result;
5833 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5834 if (tmp == NULL)
5835 return NULL;
5836 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5837 Py_DECREF(tmp);
5838 return result;
5839}
5840
5841PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005842PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005844 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845}
5846
5847/* --- Unicode Escape Codec ----------------------------------------------- */
5848
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the input, once its escapes are resolved, is still a pure ASCII string.

   Returns -1 if size is negative, if the input contains a non-ASCII byte,
   a backslash at the very end, or an escape whose result is not guaranteed
   to be ASCII (octal, \x, \u, \U, \N).  Otherwise returns the number of
   characters the decoded string will have:
     - backslash + newline yields no character,
     - the single-letter escapes (\\ \' \" \b \f \t \n \r \v \a) yield one,
     - any other escape is kept as is and yields two (backslash + char).
*/
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5903
Fredrik Lundh06d12682001-01-24 07:59:11 +00005904static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005905
Alexander Belopolsky40018472011-02-26 01:02:56 +00005906PyObject *
5907PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005908 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005909 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005912 Py_ssize_t startinpos;
5913 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005914 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005916 char* message;
5917 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 PyObject *errorHandler = NULL;
5919 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005920 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005922 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005923 if (len == 0)
5924 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005925
5926 /* After length_of_escaped_ascii_string() there are two alternatives,
5927 either the string is pure ASCII with named escapes like \n, etc.
5928 and we determined it's exact size (common case)
5929 or it contains \x, \u, ... escape sequences. then we create a
5930 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005931 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005932 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005933 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934 }
5935 else {
5936 /* Escaped strings will always be longer than the resulting
5937 Unicode string, so we start with size here and then reduce the
5938 length after conversion to the true value.
5939 (but if the error callback returns a long replacement string
5940 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005941 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942 }
5943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005945 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 while (s < end) {
5949 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005950 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
5953 /* Non-escape characters are interpreted as Unicode ordinals */
5954 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005955 x = (unsigned char)*s;
5956 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005957 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 continue;
5960 }
5961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005962 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 /* \ - Escapes */
5964 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005965 c = *s++;
5966 if (s > end)
5967 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005969 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005972#define WRITECHAR(ch) \
5973 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005974 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005976 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 case '\\': WRITECHAR('\\'); break;
5980 case '\'': WRITECHAR('\''); break;
5981 case '\"': WRITECHAR('\"'); break;
5982 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005984 case 'f': WRITECHAR('\014'); break;
5985 case 't': WRITECHAR('\t'); break;
5986 case 'n': WRITECHAR('\n'); break;
5987 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005989 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005991 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 case '0': case '1': case '2': case '3':
5995 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005996 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005997 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005998 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005999 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006000 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006002 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 break;
6004
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* hex escapes */
6006 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006008 digits = 2;
6009 message = "truncated \\xXX escape";
6010 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006014 digits = 4;
6015 message = "truncated \\uXXXX escape";
6016 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006019 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006020 digits = 8;
6021 message = "truncated \\UXXXXXXXX escape";
6022 hexescape:
6023 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006024 if (end - s < digits) {
6025 /* count only hex digits */
6026 for (; s < end; ++s) {
6027 c = (unsigned char)*s;
6028 if (!Py_ISXDIGIT(c))
6029 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006030 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006031 goto error;
6032 }
6033 for (; digits--; ++s) {
6034 c = (unsigned char)*s;
6035 if (!Py_ISXDIGIT(c))
6036 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006037 chr = (chr<<4) & ~0xF;
6038 if (c >= '0' && c <= '9')
6039 chr += c - '0';
6040 else if (c >= 'a' && c <= 'f')
6041 chr += 10 + c - 'a';
6042 else
6043 chr += 10 + c - 'A';
6044 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006045 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 /* _decoding_error will have already written into the
6047 target buffer. */
6048 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006050 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006051 message = "illegal Unicode character";
6052 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02006053 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006054 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 break;
6056
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 case 'N':
6059 message = "malformed \\N character escape";
6060 if (ucnhash_CAPI == NULL) {
6061 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006062 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6063 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 if (ucnhash_CAPI == NULL)
6065 goto ucnhashError;
6066 }
6067 if (*s == '{') {
6068 const char *start = s+1;
6069 /* look for the closing brace */
6070 while (*s != '}' && s < end)
6071 s++;
6072 if (s > start && s < end && *s == '}') {
6073 /* found a name. look it up in the unicode database */
6074 message = "unknown Unicode character name";
6075 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006076 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006077 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006078 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006079 goto store;
6080 }
6081 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006082 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083
6084 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006085 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 message = "\\ at end of string";
6087 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006088 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006089 }
6090 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006092 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006093 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006096 continue;
6097
6098 error:
6099 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006100 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006101 errors, &errorHandler,
6102 "unicodeescape", message,
6103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006104 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006105 goto onError;
6106 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006108#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006109
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006110 Py_XDECREF(errorHandler);
6111 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006112 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006113
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006115 PyErr_SetString(
6116 PyExc_UnicodeError,
6117 "\\N escapes not supported (can't load unicodedata module)"
6118 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006119 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006122 return NULL;
6123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129}
6130
6131/* Return a Unicode-Escape string version of the Unicode object.
6132
6133 If quotes is true, the string is enclosed in u"" or u'' quotes as
6134 appropriate.
6135
6136*/
6137
Alexander Belopolsky40018472011-02-26 01:02:56 +00006138PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 int kind;
6144 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006145 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
Ezio Melottie7f90372012-10-05 03:33:31 +03006147 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006148 escape.
6149
Ezio Melottie7f90372012-10-05 03:33:31 +03006150 For UCS1 strings it's '\xxx', 4 bytes per source character.
6151 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6152 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006153 */
6154
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 if (!PyUnicode_Check(unicode)) {
6156 PyErr_BadArgument();
6157 return NULL;
6158 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006159 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006161
6162 _PyBytesWriter_Init(&writer);
6163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 len = PyUnicode_GET_LENGTH(unicode);
6165 kind = PyUnicode_KIND(unicode);
6166 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167
Victor Stinner358af132015-10-12 22:36:57 +02006168 p = _PyBytesWriter_Alloc(&writer, len);
6169 if (p == NULL)
6170 goto error;
6171 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006174 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006175
Walter Dörwald79e913e2007-05-12 11:08:06 +00006176 /* Escape backslashes */
6177 if (ch == '\\') {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006178 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006179 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6180 if (p == NULL)
6181 goto error;
6182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 *p++ = '\\';
6184 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006185 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006186 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006187
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006188 /* Map 21-bit characters to '\U00xxxxxx' */
6189 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006190 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006191
6192 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6193 if (p == NULL)
6194 goto error;
6195
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006196 *p++ = '\\';
6197 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006198 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6199 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6200 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6201 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6202 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6203 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6204 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6205 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006207 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006210 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006211 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6212 if (p == NULL)
6213 goto error;
6214
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 *p++ = '\\';
6216 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006217 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6218 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6219 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006222
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006223 /* Map special whitespace to '\t', \n', '\r' */
6224 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006225 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6226 if (p == NULL)
6227 goto error;
6228
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006229 *p++ = '\\';
6230 *p++ = 't';
6231 }
6232 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006233 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6234 if (p == NULL)
6235 goto error;
6236
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006237 *p++ = '\\';
6238 *p++ = 'n';
6239 }
6240 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006241 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6242 if (p == NULL)
6243 goto error;
6244
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006245 *p++ = '\\';
6246 *p++ = 'r';
6247 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006248
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006249 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006250 else if (ch < ' ' || ch >= 0x7F) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006251 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006252 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6253 if (p == NULL)
6254 goto error;
6255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006258 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6259 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006260 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 /* Copy everything else as-is */
6263 else
6264 *p++ = (char) ch;
6265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
Victor Stinner358af132015-10-12 22:36:57 +02006267 return _PyBytesWriter_Finish(&writer, p);
6268
6269error:
6270 _PyBytesWriter_Dealloc(&writer);
6271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272}
6273
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006275PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6276 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 PyObject *result;
6279 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6280 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006282 result = PyUnicode_AsUnicodeEscapeString(tmp);
6283 Py_DECREF(tmp);
6284 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285}
6286
6287/* --- Raw Unicode Escape Codec ------------------------------------------- */
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
6290PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006291 Py_ssize_t size,
6292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006295 Py_ssize_t startinpos;
6296 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006297 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 const char *end;
6299 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 PyObject *errorHandler = NULL;
6301 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006302
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006303 if (size == 0)
6304 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 /* Escaped strings will always be longer than the resulting
6307 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006308 length after conversion to the true value. (But decoding error
6309 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006310 _PyUnicodeWriter_Init(&writer);
6311 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006312
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 end = s + size;
6314 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 unsigned char c;
6316 Py_UCS4 x;
6317 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006318 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 /* Non-escape characters are interpreted as Unicode ordinals */
6321 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006322 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006323 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006324 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006326 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 startinpos = s-starts;
6328
6329 /* \u-escapes are only interpreted iff the number of leading
6330 backslashes if odd */
6331 bs = s;
6332 for (;s < end;) {
6333 if (*s != '\\')
6334 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006335 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006336 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006337 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 }
6339 if (((s - bs) & 1) == 0 ||
6340 s >= end ||
6341 (*s != 'u' && *s != 'U')) {
6342 continue;
6343 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 count = *s=='u' ? 4 : 8;
6346 s++;
6347
6348 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 for (x = 0, i = 0; i < count; ++i, ++s) {
6350 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006351 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006353 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 errors, &errorHandler,
6355 "rawunicodeescape", "truncated \\uXXXX",
6356 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006357 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 goto onError;
6359 goto nextByte;
6360 }
6361 x = (x<<4) & ~0xF;
6362 if (c >= '0' && c <= '9')
6363 x += c - '0';
6364 else if (c >= 'a' && c <= 'f')
6365 x += 10 + c - 'a';
6366 else
6367 x += 10 + c - 'A';
6368 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006369 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006370 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006371 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006372 }
6373 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006374 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006375 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006376 errors, &errorHandler,
6377 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006379 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006381 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 nextByte:
6383 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 Py_XDECREF(errorHandler);
6386 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006387 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006388
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006390 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 Py_XDECREF(errorHandler);
6392 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 return NULL;
6394}
6395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396
Alexander Belopolsky40018472011-02-26 01:02:56 +00006397PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006401 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 int kind;
6403 void *data;
6404 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006405 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407 if (!PyUnicode_Check(unicode)) {
6408 PyErr_BadArgument();
6409 return NULL;
6410 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006411 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006413
6414 _PyBytesWriter_Init(&writer);
6415
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 kind = PyUnicode_KIND(unicode);
6417 data = PyUnicode_DATA(unicode);
6418 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006419
Victor Stinner358af132015-10-12 22:36:57 +02006420 p = _PyBytesWriter_Alloc(&writer, len);
6421 if (p == NULL)
6422 goto error;
6423 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006424
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 for (pos = 0; pos < len; pos++) {
6426 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 /* Map 32-bit characters to '\Uxxxxxxxx' */
6428 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006429 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006430
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006431 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006432 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6433 if (p == NULL)
6434 goto error;
6435
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006436 *p++ = '\\';
6437 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006438 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6439 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6440 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6441 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6442 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6443 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6444 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6445 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006446 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006448 else if (ch >= 256) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006449 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006450 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6451 if (p == NULL)
6452 goto error;
6453
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 *p++ = '\\';
6455 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006456 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6459 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 /* Copy everything else as-is */
6462 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 *p++ = (char) ch;
6464 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006465
Victor Stinner358af132015-10-12 22:36:57 +02006466 return _PyBytesWriter_Finish(&writer, p);
6467
6468error:
6469 _PyBytesWriter_Dealloc(&writer);
6470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471}
6472
Alexander Belopolsky40018472011-02-26 01:02:56 +00006473PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006474PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6475 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006477 PyObject *result;
6478 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6479 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006480 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006481 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6482 Py_DECREF(tmp);
6483 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484}
6485
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006486/* --- Unicode Internal Codec ------------------------------------------- */
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
6489_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006490 Py_ssize_t size,
6491 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006492{
6493 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006494 Py_ssize_t startinpos;
6495 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006496 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497 const char *end;
6498 const char *reason;
6499 PyObject *errorHandler = NULL;
6500 PyObject *exc = NULL;
6501
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006502 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006503 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006504 1))
6505 return NULL;
6506
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006507 if (size == 0)
6508 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006509
Victor Stinner8f674cc2013-04-17 23:02:17 +02006510 _PyUnicodeWriter_Init(&writer);
6511 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6512 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006514 }
6515 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006516
Victor Stinner8f674cc2013-04-17 23:02:17 +02006517 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006518 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006519 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006520 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006521 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006522 endinpos = end-starts;
6523 reason = "truncated input";
6524 goto error;
6525 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006526 /* We copy the raw representation one byte at a time because the
6527 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006528 ((char *) &uch)[0] = s[0];
6529 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006530#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006531 ((char *) &uch)[2] = s[2];
6532 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006533#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006534 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006535#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006536 /* We have to sanity check the raw data, otherwise doom looms for
6537 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006538 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006539 endinpos = s - starts + Py_UNICODE_SIZE;
6540 reason = "illegal code point (> 0x10FFFF)";
6541 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006542 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006543#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544 s += Py_UNICODE_SIZE;
6545#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006546 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006547 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006548 Py_UNICODE uch2;
6549 ((char *) &uch2)[0] = s[0];
6550 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006551 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006552 {
Victor Stinner551ac952011-11-29 22:58:13 +01006553 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 }
6556 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006557#endif
6558
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006559 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006560 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006561 continue;
6562
6563 error:
6564 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006565 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006566 errors, &errorHandler,
6567 "unicode_internal", reason,
6568 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006569 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006570 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006571 }
6572
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006573 Py_XDECREF(errorHandler);
6574 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006575 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006576
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006578 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006579 Py_XDECREF(errorHandler);
6580 Py_XDECREF(exc);
6581 return NULL;
6582}
6583
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584/* --- Latin-1 Codec ------------------------------------------------------ */
6585
Alexander Belopolsky40018472011-02-26 01:02:56 +00006586PyObject *
6587PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006588 Py_ssize_t size,
6589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006592 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593}
6594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596static void
6597make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006598 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006599 PyObject *unicode,
6600 Py_ssize_t startpos, Py_ssize_t endpos,
6601 const char *reason)
6602{
6603 if (*exceptionObject == NULL) {
6604 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006606 encoding, unicode, startpos, endpos, reason);
6607 }
6608 else {
6609 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6610 goto onError;
6611 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6612 goto onError;
6613 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6614 goto onError;
6615 return;
6616 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006617 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006618 }
6619}
6620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006622static void
6623raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006624 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006625 PyObject *unicode,
6626 Py_ssize_t startpos, Py_ssize_t endpos,
6627 const char *reason)
6628{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006629 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006630 encoding, unicode, startpos, endpos, reason);
6631 if (*exceptionObject != NULL)
6632 PyCodec_StrictErrors(*exceptionObject);
6633}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634
6635/* error handling callback helper:
6636 build arguments, call the callback and check the arguments,
6637 put the result into newpos and return the replacement string, which
6638 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006639static PyObject *
6640unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006641 PyObject **errorHandler,
6642 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006644 Py_ssize_t startpos, Py_ssize_t endpos,
6645 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006647 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 PyObject *restuple;
6650 PyObject *resunicode;
6651
6652 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 }
6657
Benjamin Petersonbac79492012-01-14 13:34:47 -05006658 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 return NULL;
6660 len = PyUnicode_GET_LENGTH(unicode);
6661
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006662 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666
6667 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006672 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 Py_DECREF(restuple);
6674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006676 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 &resunicode, newpos)) {
6678 Py_DECREF(restuple);
6679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006681 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6682 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6683 Py_DECREF(restuple);
6684 return NULL;
6685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006687 *newpos = len + *newpos;
6688 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006689 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 Py_DECREF(restuple);
6691 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006692 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 Py_INCREF(resunicode);
6694 Py_DECREF(restuple);
6695 return resunicode;
6696}
6697
Alexander Belopolsky40018472011-02-26 01:02:56 +00006698static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006699unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006700 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006701 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 /* input state */
6704 Py_ssize_t pos=0, size;
6705 int kind;
6706 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 /* pointer into the output */
6708 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006709 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6710 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006711 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006713 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006714 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006715 /* output object */
6716 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717
Benjamin Petersonbac79492012-01-14 13:34:47 -05006718 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 return NULL;
6720 size = PyUnicode_GET_LENGTH(unicode);
6721 kind = PyUnicode_KIND(unicode);
6722 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 /* allocate enough for a simple encoding without
6724 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006725 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006726 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006727
6728 _PyBytesWriter_Init(&writer);
6729 str = _PyBytesWriter_Alloc(&writer, size);
6730 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006734 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006737 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006739 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006743 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006745 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006748
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006749 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006751
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006752 /* Only overallocate the buffer if it's not the last write */
6753 writer.overallocate = (collend < size);
6754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006756 if (error_handler == _Py_ERROR_UNKNOWN)
6757 error_handler = get_error_handler(errors);
6758
6759 switch (error_handler) {
6760 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006761 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006763
6764 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006765 memset(str, '?', collend - collstart);
6766 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006767 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006768 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 break;
Victor Stinner50149202015-09-22 00:26:54 +02006771
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006772 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006773 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006774 writer.min_size -= (collend - collstart);
6775 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006776 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006777 if (str == NULL)
6778 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006779 pos = collend;
6780 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006781
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006782 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006783 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006784 writer.min_size -= (collend - collstart);
6785 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006786 unicode, collstart, collend);
6787 if (str == NULL)
6788 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 break;
Victor Stinner50149202015-09-22 00:26:54 +02006791
Victor Stinnerc3713e92015-09-29 12:32:13 +02006792 case _Py_ERROR_SURROGATEESCAPE:
6793 for (i = collstart; i < collend; ++i) {
6794 ch = PyUnicode_READ(kind, data, i);
6795 if (ch < 0xdc80 || 0xdcff < ch) {
6796 /* Not a UTF-8b surrogate */
6797 break;
6798 }
6799 *str++ = (char)(ch - 0xdc00);
6800 ++pos;
6801 }
6802 if (i >= collend)
6803 break;
6804 collstart = pos;
6805 assert(collstart != collend);
6806 /* fallback to general error handling */
6807
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006809 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6810 encoding, reason, unicode, &exc,
6811 collstart, collend, &newpos);
6812 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006814
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006815 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006816 writer.min_size -= 1;
6817
Victor Stinner6bd525b2015-10-09 13:10:05 +02006818 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006819 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006820 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006821 PyBytes_AS_STRING(rep),
6822 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006823 if (str == NULL)
6824 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006825 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006826 else {
6827 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006828
Victor Stinner6bd525b2015-10-09 13:10:05 +02006829 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006831
6832 if (PyUnicode_IS_ASCII(rep)) {
6833 /* Fast path: all characters are smaller than limit */
6834 assert(limit >= 128);
6835 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6836 str = _PyBytesWriter_WriteBytes(&writer, str,
6837 PyUnicode_DATA(rep),
6838 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 else {
6841 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6842
6843 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6844 if (str == NULL)
6845 goto onError;
6846
6847 /* check if there is anything unencodable in the
6848 replacement and copy it to the output */
6849 for (i = 0; repsize-->0; ++i, ++str) {
6850 ch = PyUnicode_READ_CHAR(rep, i);
6851 if (ch >= limit) {
6852 raise_encode_exception(&exc, encoding, unicode,
6853 pos, pos+1, reason);
6854 goto onError;
6855 }
6856 *str = (char)ch;
6857 }
6858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006861 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863
6864 /* If overallocation was disabled, ensure that it was the last
6865 write. Otherwise, we missed an optimization */
6866 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
6868 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006869
Victor Stinner50149202015-09-22 00:26:54 +02006870 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006872 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006873
6874 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006875 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006876 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006877 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006878 Py_XDECREF(exc);
6879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880}
6881
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006882/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006883PyObject *
6884PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006885 Py_ssize_t size,
6886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 PyObject *result;
6889 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6890 if (unicode == NULL)
6891 return NULL;
6892 result = unicode_encode_ucs1(unicode, errors, 256);
6893 Py_DECREF(unicode);
6894 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Alexander Belopolsky40018472011-02-26 01:02:56 +00006897PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006898_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 PyErr_BadArgument();
6902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904 if (PyUnicode_READY(unicode) == -1)
6905 return NULL;
6906 /* Fast path: if it is a one-byte string, construct
6907 bytes object directly. */
6908 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6909 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6910 PyUnicode_GET_LENGTH(unicode));
6911 /* Non-Latin-1 characters present. Defer to above function to
6912 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006914}
6915
6916PyObject*
6917PyUnicode_AsLatin1String(PyObject *unicode)
6918{
6919 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
6922/* --- 7-bit ASCII Codec -------------------------------------------------- */
6923
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyObject *
6925PyUnicode_DecodeASCII(const char *s,
6926 Py_ssize_t size,
6927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006930 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006931 int kind;
6932 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006933 Py_ssize_t startinpos;
6934 Py_ssize_t endinpos;
6935 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006942 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006945 if (size == 1 && (unsigned char)s[0] < 128)
6946 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006947
Victor Stinner8f674cc2013-04-17 23:02:17 +02006948 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006949 writer.min_length = size;
6950 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006951 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006954 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006955 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006956 writer.pos = outpos;
6957 if (writer.pos == size)
6958 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006959
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 s += writer.pos;
6961 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006963 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 PyUnicode_WRITE(kind, data, writer.pos, c);
6966 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006968 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970
6971 /* byte outsize range 0x00..0x7f: call the error handler */
6972
6973 if (error_handler == _Py_ERROR_UNKNOWN)
6974 error_handler = get_error_handler(errors);
6975
6976 switch (error_handler)
6977 {
6978 case _Py_ERROR_REPLACE:
6979 case _Py_ERROR_SURROGATEESCAPE:
6980 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006981 but we may switch to UCS2 at the first write */
6982 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6983 goto onError;
6984 kind = writer.kind;
6985 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006986
6987 if (error_handler == _Py_ERROR_REPLACE)
6988 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6989 else
6990 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6991 writer.pos++;
6992 ++s;
6993 break;
6994
6995 case _Py_ERROR_IGNORE:
6996 ++s;
6997 break;
6998
6999 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 startinpos = s-starts;
7001 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007002 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007003 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 "ascii", "ordinal not in range(128)",
7005 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 kind = writer.kind;
7009 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007012 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007015
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007017 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 return NULL;
7021}
7022
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024PyObject *
7025PyUnicode_EncodeASCII(const Py_UNICODE *p,
7026 Py_ssize_t size,
7027 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 PyObject *result;
7030 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7031 if (unicode == NULL)
7032 return NULL;
7033 result = unicode_encode_ucs1(unicode, errors, 128);
7034 Py_DECREF(unicode);
7035 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Alexander Belopolsky40018472011-02-26 01:02:56 +00007038PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007039_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040{
7041 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 PyErr_BadArgument();
7043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007045 if (PyUnicode_READY(unicode) == -1)
7046 return NULL;
7047 /* Fast path: if it is an ASCII-only string, construct bytes object
7048 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007049 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7051 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007052 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007053}
7054
7055PyObject *
7056PyUnicode_AsASCIIString(PyObject *unicode)
7057{
7058 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059}
7060
Victor Stinner99b95382011-07-04 14:23:54 +02007061#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007062
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007063/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007064
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007065#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066#define NEED_RETRY
7067#endif
7068
Victor Stinner3a50e702011-10-18 21:21:00 +02007069#ifndef WC_ERR_INVALID_CHARS
7070# define WC_ERR_INVALID_CHARS 0x0080
7071#endif
7072
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007073static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007074code_page_name(UINT code_page, PyObject **obj)
7075{
7076 *obj = NULL;
7077 if (code_page == CP_ACP)
7078 return "mbcs";
7079 if (code_page == CP_UTF7)
7080 return "CP_UTF7";
7081 if (code_page == CP_UTF8)
7082 return "CP_UTF8";
7083
7084 *obj = PyBytes_FromFormat("cp%u", code_page);
7085 if (*obj == NULL)
7086 return NULL;
7087 return PyBytes_AS_STRING(*obj);
7088}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090static DWORD
7091decode_code_page_flags(UINT code_page)
7092{
7093 if (code_page == CP_UTF7) {
7094 /* The CP_UTF7 decoder only supports flags=0 */
7095 return 0;
7096 }
7097 else
7098 return MB_ERR_INVALID_CHARS;
7099}
7100
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 * Decode a byte string from a Windows code page into unicode object in strict
7103 * mode.
7104 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007105 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7106 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007109decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007110 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 const char *in,
7112 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113{
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007115 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
7118 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 assert(insize > 0);
7120 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7121 if (outsize <= 0)
7122 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
7124 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007126 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007127 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 if (*v == NULL)
7129 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 }
7132 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007135 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138 }
7139
7140 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7142 if (outsize <= 0)
7143 goto error;
7144 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007145
Victor Stinner3a50e702011-10-18 21:21:00 +02007146error:
7147 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7148 return -2;
7149 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007150 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151}
7152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153/*
7154 * Decode a byte string from a code page into unicode object with an error
7155 * handler.
7156 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007157 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 * UnicodeDecodeError exception and returns -1 on error.
7159 */
7160static int
7161decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007162 PyObject **v,
7163 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007164 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007165{
7166 const char *startin = in;
7167 const char *endin = in + size;
7168 const DWORD flags = decode_code_page_flags(code_page);
7169 /* Ideally, we should get reason from FormatMessage. This is the Windows
7170 2000 English version of the message. */
7171 const char *reason = "No mapping for the Unicode character exists "
7172 "in the target code page.";
7173 /* each step cannot decode more than 1 character, but a character can be
7174 represented as a surrogate pair */
7175 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 int insize;
7177 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 PyObject *errorHandler = NULL;
7179 PyObject *exc = NULL;
7180 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007181 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 DWORD err;
7183 int ret = -1;
7184
7185 assert(size > 0);
7186
7187 encoding = code_page_name(code_page, &encoding_obj);
7188 if (encoding == NULL)
7189 return -1;
7190
Victor Stinner7d00cc12014-03-17 23:08:06 +01007191 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7193 UnicodeDecodeError. */
7194 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7195 if (exc != NULL) {
7196 PyCodec_StrictErrors(exc);
7197 Py_CLEAR(exc);
7198 }
7199 goto error;
7200 }
7201
7202 if (*v == NULL) {
7203 /* Create unicode object */
7204 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
Victor Stinnerab595942011-12-17 04:59:06 +01007208 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007209 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 if (*v == NULL)
7211 goto error;
7212 startout = PyUnicode_AS_UNICODE(*v);
7213 }
7214 else {
7215 /* Extend unicode object */
7216 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7217 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7218 PyErr_NoMemory();
7219 goto error;
7220 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007221 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 goto error;
7223 startout = PyUnicode_AS_UNICODE(*v) + n;
7224 }
7225
7226 /* Decode the byte string character per character */
7227 out = startout;
7228 while (in < endin)
7229 {
7230 /* Decode a character */
7231 insize = 1;
7232 do
7233 {
7234 outsize = MultiByteToWideChar(code_page, flags,
7235 in, insize,
7236 buffer, Py_ARRAY_LENGTH(buffer));
7237 if (outsize > 0)
7238 break;
7239 err = GetLastError();
7240 if (err != ERROR_NO_UNICODE_TRANSLATION
7241 && err != ERROR_INSUFFICIENT_BUFFER)
7242 {
7243 PyErr_SetFromWindowsErr(0);
7244 goto error;
7245 }
7246 insize++;
7247 }
7248 /* 4=maximum length of a UTF-8 sequence */
7249 while (insize <= 4 && (in + insize) <= endin);
7250
7251 if (outsize <= 0) {
7252 Py_ssize_t startinpos, endinpos, outpos;
7253
Victor Stinner7d00cc12014-03-17 23:08:06 +01007254 /* last character in partial decode? */
7255 if (in + insize >= endin && !final)
7256 break;
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 startinpos = in - startin;
7259 endinpos = startinpos + 1;
7260 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007261 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 errors, &errorHandler,
7263 encoding, reason,
7264 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007265 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 {
7267 goto error;
7268 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007269 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 }
7271 else {
7272 in += insize;
7273 memcpy(out, buffer, outsize * sizeof(wchar_t));
7274 out += outsize;
7275 }
7276 }
7277
7278 /* write a NUL character at the end */
7279 *out = 0;
7280
7281 /* Extend unicode object */
7282 outsize = out - startout;
7283 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007284 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007286 /* (in - startin) <= size and size is an int */
7287 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007288
7289error:
7290 Py_XDECREF(encoding_obj);
7291 Py_XDECREF(errorHandler);
7292 Py_XDECREF(exc);
7293 return ret;
7294}
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296static PyObject *
7297decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 const char *s, Py_ssize_t size,
7299 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007300{
Victor Stinner76a31a62011-11-04 00:05:13 +01007301 PyObject *v = NULL;
7302 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 if (code_page < 0) {
7305 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7306 return NULL;
7307 }
7308
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311
Victor Stinner76a31a62011-11-04 00:05:13 +01007312 do
7313 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007315 if (size > INT_MAX) {
7316 chunk_size = INT_MAX;
7317 final = 0;
7318 done = 0;
7319 }
7320 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 {
7323 chunk_size = (int)size;
7324 final = (consumed == NULL);
7325 done = 1;
7326 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 if (chunk_size == 0 && done) {
7329 if (v != NULL)
7330 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007331 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007332 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 converted = decode_code_page_strict(code_page, &v,
7335 s, chunk_size);
7336 if (converted == -2)
7337 converted = decode_code_page_errors(code_page, &v,
7338 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007339 errors, final);
7340 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007341
7342 if (converted < 0) {
7343 Py_XDECREF(v);
7344 return NULL;
7345 }
7346
7347 if (consumed)
7348 *consumed += converted;
7349
7350 s += converted;
7351 size -= converted;
7352 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007353
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007354 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007355}
7356
Alexander Belopolsky40018472011-02-26 01:02:56 +00007357PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007358PyUnicode_DecodeCodePageStateful(int code_page,
7359 const char *s,
7360 Py_ssize_t size,
7361 const char *errors,
7362 Py_ssize_t *consumed)
7363{
7364 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7365}
7366
7367PyObject *
7368PyUnicode_DecodeMBCSStateful(const char *s,
7369 Py_ssize_t size,
7370 const char *errors,
7371 Py_ssize_t *consumed)
7372{
7373 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7374}
7375
7376PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007377PyUnicode_DecodeMBCS(const char *s,
7378 Py_ssize_t size,
7379 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007380{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7382}
7383
Victor Stinner3a50e702011-10-18 21:21:00 +02007384static DWORD
7385encode_code_page_flags(UINT code_page, const char *errors)
7386{
7387 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007388 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 }
7390 else if (code_page == CP_UTF7) {
7391 /* CP_UTF7 only supports flags=0 */
7392 return 0;
7393 }
7394 else {
7395 if (errors != NULL && strcmp(errors, "replace") == 0)
7396 return 0;
7397 else
7398 return WC_NO_BEST_FIT_CHARS;
7399 }
7400}
7401
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007402/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 * Encode a Unicode string to a Windows code page into a byte string in strict
7404 * mode.
7405 *
7406 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007407 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007408 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007409static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007410encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007411 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413{
Victor Stinner554f3f02010-06-16 23:33:54 +00007414 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 BOOL *pusedDefaultChar = &usedDefaultChar;
7416 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007417 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007418 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 const DWORD flags = encode_code_page_flags(code_page, NULL);
7420 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007421 /* Create a substring so that we can get the UTF-16 representation
7422 of just the slice under consideration. */
7423 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424
Martin v. Löwis3d325192011-11-04 18:23:06 +01007425 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007426
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007428 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007430 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007431
Victor Stinner2fc507f2011-11-04 20:06:39 +01007432 substring = PyUnicode_Substring(unicode, offset, offset+len);
7433 if (substring == NULL)
7434 return -1;
7435 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7436 if (p == NULL) {
7437 Py_DECREF(substring);
7438 return -1;
7439 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007440 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007441
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007442 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007444 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 NULL, 0,
7446 NULL, pusedDefaultChar);
7447 if (outsize <= 0)
7448 goto error;
7449 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007450 if (pusedDefaultChar && *pusedDefaultChar) {
7451 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007453 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007454
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 if (*outbytes == NULL) {
7459 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007463 }
7464 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 const Py_ssize_t n = PyBytes_Size(*outbytes);
7467 if (outsize > PY_SSIZE_T_MAX - n) {
7468 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7473 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477 }
7478
7479 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007481 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 out, outsize,
7483 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 if (outsize <= 0)
7486 goto error;
7487 if (pusedDefaultChar && *pusedDefaultChar)
7488 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007490
Victor Stinner3a50e702011-10-18 21:21:00 +02007491error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7494 return -2;
7495 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007496 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007497}
7498
Victor Stinner3a50e702011-10-18 21:21:00 +02007499/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007500 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 * error handler.
7502 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007503 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 * -1 on other error.
7505 */
7506static int
7507encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007508 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007509 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007510{
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007512 Py_ssize_t pos = unicode_offset;
7513 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 /* Ideally, we should get reason from FormatMessage. This is the Windows
7515 2000 English version of the message. */
7516 const char *reason = "invalid character";
7517 /* 4=maximum length of a UTF-8 sequence */
7518 char buffer[4];
7519 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7520 Py_ssize_t outsize;
7521 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 PyObject *errorHandler = NULL;
7523 PyObject *exc = NULL;
7524 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007525 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007526 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 PyObject *rep;
7528 int ret = -1;
7529
7530 assert(insize > 0);
7531
7532 encoding = code_page_name(code_page, &encoding_obj);
7533 if (encoding == NULL)
7534 return -1;
7535
7536 if (errors == NULL || strcmp(errors, "strict") == 0) {
7537 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7538 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007539 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 if (exc != NULL) {
7541 PyCodec_StrictErrors(exc);
7542 Py_DECREF(exc);
7543 }
7544 Py_XDECREF(encoding_obj);
7545 return -1;
7546 }
7547
7548 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7549 pusedDefaultChar = &usedDefaultChar;
7550 else
7551 pusedDefaultChar = NULL;
7552
7553 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7554 PyErr_NoMemory();
7555 goto error;
7556 }
7557 outsize = insize * Py_ARRAY_LENGTH(buffer);
7558
7559 if (*outbytes == NULL) {
7560 /* Create string object */
7561 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7562 if (*outbytes == NULL)
7563 goto error;
7564 out = PyBytes_AS_STRING(*outbytes);
7565 }
7566 else {
7567 /* Extend string object */
7568 Py_ssize_t n = PyBytes_Size(*outbytes);
7569 if (n > PY_SSIZE_T_MAX - outsize) {
7570 PyErr_NoMemory();
7571 goto error;
7572 }
7573 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7574 goto error;
7575 out = PyBytes_AS_STRING(*outbytes) + n;
7576 }
7577
7578 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007579 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7582 wchar_t chars[2];
7583 int charsize;
7584 if (ch < 0x10000) {
7585 chars[0] = (wchar_t)ch;
7586 charsize = 1;
7587 }
7588 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007589 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7590 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007591 charsize = 2;
7592 }
7593
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007595 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 buffer, Py_ARRAY_LENGTH(buffer),
7597 NULL, pusedDefaultChar);
7598 if (outsize > 0) {
7599 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7600 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007601 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 memcpy(out, buffer, outsize);
7603 out += outsize;
7604 continue;
7605 }
7606 }
7607 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7608 PyErr_SetFromWindowsErr(0);
7609 goto error;
7610 }
7611
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 rep = unicode_encode_call_errorhandler(
7613 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007614 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 if (rep == NULL)
7617 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007618 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007619
7620 if (PyBytes_Check(rep)) {
7621 outsize = PyBytes_GET_SIZE(rep);
7622 if (outsize != 1) {
7623 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7624 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7625 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7626 Py_DECREF(rep);
7627 goto error;
7628 }
7629 out = PyBytes_AS_STRING(*outbytes) + offset;
7630 }
7631 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7632 out += outsize;
7633 }
7634 else {
7635 Py_ssize_t i;
7636 enum PyUnicode_Kind kind;
7637 void *data;
7638
Benjamin Petersonbac79492012-01-14 13:34:47 -05007639 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 Py_DECREF(rep);
7641 goto error;
7642 }
7643
7644 outsize = PyUnicode_GET_LENGTH(rep);
7645 if (outsize != 1) {
7646 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7647 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7648 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7649 Py_DECREF(rep);
7650 goto error;
7651 }
7652 out = PyBytes_AS_STRING(*outbytes) + offset;
7653 }
7654 kind = PyUnicode_KIND(rep);
7655 data = PyUnicode_DATA(rep);
7656 for (i=0; i < outsize; i++) {
7657 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7658 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007659 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007660 encoding, unicode,
7661 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 "unable to encode error handler result to ASCII");
7663 Py_DECREF(rep);
7664 goto error;
7665 }
7666 *out = (unsigned char)ch;
7667 out++;
7668 }
7669 }
7670 Py_DECREF(rep);
7671 }
7672 /* write a NUL byte */
7673 *out = 0;
7674 outsize = out - PyBytes_AS_STRING(*outbytes);
7675 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7676 if (_PyBytes_Resize(outbytes, outsize) < 0)
7677 goto error;
7678 ret = 0;
7679
7680error:
7681 Py_XDECREF(encoding_obj);
7682 Py_XDECREF(errorHandler);
7683 Py_XDECREF(exc);
7684 return ret;
7685}
7686
Victor Stinner3a50e702011-10-18 21:21:00 +02007687static PyObject *
7688encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007689 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 const char *errors)
7691{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007694 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007695 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007696
Victor Stinner29dacf22015-01-26 16:41:32 +01007697 if (!PyUnicode_Check(unicode)) {
7698 PyErr_BadArgument();
7699 return NULL;
7700 }
7701
Benjamin Petersonbac79492012-01-14 13:34:47 -05007702 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007703 return NULL;
7704 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007705
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 if (code_page < 0) {
7707 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7708 return NULL;
7709 }
7710
Martin v. Löwis3d325192011-11-04 18:23:06 +01007711 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007712 return PyBytes_FromStringAndSize(NULL, 0);
7713
Victor Stinner7581cef2011-11-03 22:32:33 +01007714 offset = 0;
7715 do
7716 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007717#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007718 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007719 chunks. */
7720 if (len > INT_MAX/2) {
7721 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007722 done = 0;
7723 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007724 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007725#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007727 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007728 done = 1;
7729 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007730
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 errors);
7734 if (ret == -2)
7735 ret = encode_code_page_errors(code_page, &outbytes,
7736 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 if (ret < 0) {
7739 Py_XDECREF(outbytes);
7740 return NULL;
7741 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007742
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746
Victor Stinner3a50e702011-10-18 21:21:00 +02007747 return outbytes;
7748}
7749
7750PyObject *
7751PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7752 Py_ssize_t size,
7753 const char *errors)
7754{
Victor Stinner7581cef2011-11-03 22:32:33 +01007755 PyObject *unicode, *res;
7756 unicode = PyUnicode_FromUnicode(p, size);
7757 if (unicode == NULL)
7758 return NULL;
7759 res = encode_code_page(CP_ACP, unicode, errors);
7760 Py_DECREF(unicode);
7761 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007762}
7763
7764PyObject *
7765PyUnicode_EncodeCodePage(int code_page,
7766 PyObject *unicode,
7767 const char *errors)
7768{
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007770}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007771
Alexander Belopolsky40018472011-02-26 01:02:56 +00007772PyObject *
7773PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007774{
Victor Stinner7581cef2011-11-03 22:32:33 +01007775 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007776}
7777
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007778#undef NEED_RETRY
7779
Victor Stinner99b95382011-07-04 14:23:54 +02007780#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007781
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782/* --- Character Mapping Codec -------------------------------------------- */
7783
Victor Stinnerfb161b12013-04-18 01:44:27 +02007784static int
7785charmap_decode_string(const char *s,
7786 Py_ssize_t size,
7787 PyObject *mapping,
7788 const char *errors,
7789 _PyUnicodeWriter *writer)
7790{
7791 const char *starts = s;
7792 const char *e;
7793 Py_ssize_t startinpos, endinpos;
7794 PyObject *errorHandler = NULL, *exc = NULL;
7795 Py_ssize_t maplen;
7796 enum PyUnicode_Kind mapkind;
7797 void *mapdata;
7798 Py_UCS4 x;
7799 unsigned char ch;
7800
7801 if (PyUnicode_READY(mapping) == -1)
7802 return -1;
7803
7804 maplen = PyUnicode_GET_LENGTH(mapping);
7805 mapdata = PyUnicode_DATA(mapping);
7806 mapkind = PyUnicode_KIND(mapping);
7807
7808 e = s + size;
7809
7810 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7811 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7812 * is disabled in encoding aliases, latin1 is preferred because
7813 * its implementation is faster. */
7814 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7815 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7816 Py_UCS4 maxchar = writer->maxchar;
7817
7818 assert (writer->kind == PyUnicode_1BYTE_KIND);
7819 while (s < e) {
7820 ch = *s;
7821 x = mapdata_ucs1[ch];
7822 if (x > maxchar) {
7823 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7824 goto onError;
7825 maxchar = writer->maxchar;
7826 outdata = (Py_UCS1 *)writer->data;
7827 }
7828 outdata[writer->pos] = x;
7829 writer->pos++;
7830 ++s;
7831 }
7832 return 0;
7833 }
7834
7835 while (s < e) {
7836 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7837 enum PyUnicode_Kind outkind = writer->kind;
7838 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7839 if (outkind == PyUnicode_1BYTE_KIND) {
7840 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7841 Py_UCS4 maxchar = writer->maxchar;
7842 while (s < e) {
7843 ch = *s;
7844 x = mapdata_ucs2[ch];
7845 if (x > maxchar)
7846 goto Error;
7847 outdata[writer->pos] = x;
7848 writer->pos++;
7849 ++s;
7850 }
7851 break;
7852 }
7853 else if (outkind == PyUnicode_2BYTE_KIND) {
7854 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7855 while (s < e) {
7856 ch = *s;
7857 x = mapdata_ucs2[ch];
7858 if (x == 0xFFFE)
7859 goto Error;
7860 outdata[writer->pos] = x;
7861 writer->pos++;
7862 ++s;
7863 }
7864 break;
7865 }
7866 }
7867 ch = *s;
7868
7869 if (ch < maplen)
7870 x = PyUnicode_READ(mapkind, mapdata, ch);
7871 else
7872 x = 0xfffe; /* invalid value */
7873Error:
7874 if (x == 0xfffe)
7875 {
7876 /* undefined mapping */
7877 startinpos = s-starts;
7878 endinpos = startinpos+1;
7879 if (unicode_decode_call_errorhandler_writer(
7880 errors, &errorHandler,
7881 "charmap", "character maps to <undefined>",
7882 &starts, &e, &startinpos, &endinpos, &exc, &s,
7883 writer)) {
7884 goto onError;
7885 }
7886 continue;
7887 }
7888
7889 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7890 goto onError;
7891 ++s;
7892 }
7893 Py_XDECREF(errorHandler);
7894 Py_XDECREF(exc);
7895 return 0;
7896
7897onError:
7898 Py_XDECREF(errorHandler);
7899 Py_XDECREF(exc);
7900 return -1;
7901}
7902
7903static int
7904charmap_decode_mapping(const char *s,
7905 Py_ssize_t size,
7906 PyObject *mapping,
7907 const char *errors,
7908 _PyUnicodeWriter *writer)
7909{
7910 const char *starts = s;
7911 const char *e;
7912 Py_ssize_t startinpos, endinpos;
7913 PyObject *errorHandler = NULL, *exc = NULL;
7914 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007915 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007916
7917 e = s + size;
7918
7919 while (s < e) {
7920 ch = *s;
7921
7922 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7923 key = PyLong_FromLong((long)ch);
7924 if (key == NULL)
7925 goto onError;
7926
7927 item = PyObject_GetItem(mapping, key);
7928 Py_DECREF(key);
7929 if (item == NULL) {
7930 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7931 /* No mapping found means: mapping is undefined. */
7932 PyErr_Clear();
7933 goto Undefined;
7934 } else
7935 goto onError;
7936 }
7937
7938 /* Apply mapping */
7939 if (item == Py_None)
7940 goto Undefined;
7941 if (PyLong_Check(item)) {
7942 long value = PyLong_AS_LONG(item);
7943 if (value == 0xFFFE)
7944 goto Undefined;
7945 if (value < 0 || value > MAX_UNICODE) {
7946 PyErr_Format(PyExc_TypeError,
7947 "character mapping must be in range(0x%lx)",
7948 (unsigned long)MAX_UNICODE + 1);
7949 goto onError;
7950 }
7951
7952 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7953 goto onError;
7954 }
7955 else if (PyUnicode_Check(item)) {
7956 if (PyUnicode_READY(item) == -1)
7957 goto onError;
7958 if (PyUnicode_GET_LENGTH(item) == 1) {
7959 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7960 if (value == 0xFFFE)
7961 goto Undefined;
7962 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7963 goto onError;
7964 }
7965 else {
7966 writer->overallocate = 1;
7967 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7968 goto onError;
7969 }
7970 }
7971 else {
7972 /* wrong return value */
7973 PyErr_SetString(PyExc_TypeError,
7974 "character mapping must return integer, None or str");
7975 goto onError;
7976 }
7977 Py_CLEAR(item);
7978 ++s;
7979 continue;
7980
7981Undefined:
7982 /* undefined mapping */
7983 Py_CLEAR(item);
7984 startinpos = s-starts;
7985 endinpos = startinpos+1;
7986 if (unicode_decode_call_errorhandler_writer(
7987 errors, &errorHandler,
7988 "charmap", "character maps to <undefined>",
7989 &starts, &e, &startinpos, &endinpos, &exc, &s,
7990 writer)) {
7991 goto onError;
7992 }
7993 }
7994 Py_XDECREF(errorHandler);
7995 Py_XDECREF(exc);
7996 return 0;
7997
7998onError:
7999 Py_XDECREF(item);
8000 Py_XDECREF(errorHandler);
8001 Py_XDECREF(exc);
8002 return -1;
8003}
8004
Alexander Belopolsky40018472011-02-26 01:02:56 +00008005PyObject *
8006PyUnicode_DecodeCharmap(const char *s,
8007 Py_ssize_t size,
8008 PyObject *mapping,
8009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008011 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008012
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 /* Default to Latin-1 */
8014 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008018 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008019 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008020 writer.min_length = size;
8021 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008023
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008024 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008025 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8026 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008027 }
8028 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008029 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8030 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008032 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008033
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008035 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 return NULL;
8037}
8038
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039/* Charmap encoding: the lookup table */
8040
Alexander Belopolsky40018472011-02-26 01:02:56 +00008041struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 PyObject_HEAD
8043 unsigned char level1[32];
8044 int count2, count3;
8045 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046};
8047
8048static PyObject*
8049encoding_map_size(PyObject *obj, PyObject* args)
8050{
8051 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054}
8055
8056static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 PyDoc_STR("Return the size (in bytes) of this object") },
8059 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060};
8061
8062static void
8063encoding_map_dealloc(PyObject* o)
8064{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066}
8067
8068static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008069 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 "EncodingMap", /*tp_name*/
8071 sizeof(struct encoding_map), /*tp_basicsize*/
8072 0, /*tp_itemsize*/
8073 /* methods */
8074 encoding_map_dealloc, /*tp_dealloc*/
8075 0, /*tp_print*/
8076 0, /*tp_getattr*/
8077 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008078 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 0, /*tp_repr*/
8080 0, /*tp_as_number*/
8081 0, /*tp_as_sequence*/
8082 0, /*tp_as_mapping*/
8083 0, /*tp_hash*/
8084 0, /*tp_call*/
8085 0, /*tp_str*/
8086 0, /*tp_getattro*/
8087 0, /*tp_setattro*/
8088 0, /*tp_as_buffer*/
8089 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8090 0, /*tp_doc*/
8091 0, /*tp_traverse*/
8092 0, /*tp_clear*/
8093 0, /*tp_richcompare*/
8094 0, /*tp_weaklistoffset*/
8095 0, /*tp_iter*/
8096 0, /*tp_iternext*/
8097 encoding_map_methods, /*tp_methods*/
8098 0, /*tp_members*/
8099 0, /*tp_getset*/
8100 0, /*tp_base*/
8101 0, /*tp_dict*/
8102 0, /*tp_descr_get*/
8103 0, /*tp_descr_set*/
8104 0, /*tp_dictoffset*/
8105 0, /*tp_init*/
8106 0, /*tp_alloc*/
8107 0, /*tp_new*/
8108 0, /*tp_free*/
8109 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110};
8111
8112PyObject*
8113PyUnicode_BuildEncodingMap(PyObject* string)
8114{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115 PyObject *result;
8116 struct encoding_map *mresult;
8117 int i;
8118 int need_dict = 0;
8119 unsigned char level1[32];
8120 unsigned char level2[512];
8121 unsigned char *mlevel1, *mlevel2, *mlevel3;
8122 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 int kind;
8124 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008125 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008127
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008128 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 PyErr_BadArgument();
8130 return NULL;
8131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 kind = PyUnicode_KIND(string);
8133 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 length = PyUnicode_GET_LENGTH(string);
8135 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 memset(level1, 0xFF, sizeof level1);
8137 memset(level2, 0xFF, sizeof level2);
8138
8139 /* If there isn't a one-to-one mapping of NULL to \0,
8140 or if there are non-BMP characters, we need to use
8141 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008144 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 ch = PyUnicode_READ(kind, data, i);
8147 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 need_dict = 1;
8149 break;
8150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 /* unmapped character */
8153 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 l1 = ch >> 11;
8155 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 if (level1[l1] == 0xFF)
8157 level1[l1] = count2++;
8158 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 }
8161
8162 if (count2 >= 0xFF || count3 >= 0xFF)
8163 need_dict = 1;
8164
8165 if (need_dict) {
8166 PyObject *result = PyDict_New();
8167 PyObject *key, *value;
8168 if (!result)
8169 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008170 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008172 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173 if (!key || !value)
8174 goto failed1;
8175 if (PyDict_SetItem(result, key, value) == -1)
8176 goto failed1;
8177 Py_DECREF(key);
8178 Py_DECREF(value);
8179 }
8180 return result;
8181 failed1:
8182 Py_XDECREF(key);
8183 Py_XDECREF(value);
8184 Py_DECREF(result);
8185 return NULL;
8186 }
8187
8188 /* Create a three-level trie */
8189 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8190 16*count2 + 128*count3 - 1);
8191 if (!result)
8192 return PyErr_NoMemory();
8193 PyObject_Init(result, &EncodingMapType);
8194 mresult = (struct encoding_map*)result;
8195 mresult->count2 = count2;
8196 mresult->count3 = count3;
8197 mlevel1 = mresult->level1;
8198 mlevel2 = mresult->level23;
8199 mlevel3 = mresult->level23 + 16*count2;
8200 memcpy(mlevel1, level1, 32);
8201 memset(mlevel2, 0xFF, 16*count2);
8202 memset(mlevel3, 0, 128*count3);
8203 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008204 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008206 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8207 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008208 /* unmapped character */
8209 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008210 o1 = ch>>11;
8211 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212 i2 = 16*mlevel1[o1] + o2;
8213 if (mlevel2[i2] == 0xFF)
8214 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216 i3 = 128*mlevel2[i2] + o3;
8217 mlevel3[i3] = i;
8218 }
8219 return result;
8220}
8221
8222static int
Victor Stinner22168992011-11-20 17:09:18 +01008223encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224{
8225 struct encoding_map *map = (struct encoding_map*)mapping;
8226 int l1 = c>>11;
8227 int l2 = (c>>7) & 0xF;
8228 int l3 = c & 0x7F;
8229 int i;
8230
Victor Stinner22168992011-11-20 17:09:18 +01008231 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233 if (c == 0)
8234 return 0;
8235 /* level 1*/
8236 i = map->level1[l1];
8237 if (i == 0xFF) {
8238 return -1;
8239 }
8240 /* level 2*/
8241 i = map->level23[16*i+l2];
8242 if (i == 0xFF) {
8243 return -1;
8244 }
8245 /* level 3 */
8246 i = map->level23[16*map->count2 + 128*i + l3];
8247 if (i == 0) {
8248 return -1;
8249 }
8250 return i;
8251}
8252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253/* Lookup the character ch in the mapping. If the character
8254 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008255 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008256static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008257charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258{
Christian Heimes217cfd12007-12-02 14:31:20 +00008259 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 PyObject *x;
8261
8262 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 x = PyObject_GetItem(mapping, w);
8265 Py_DECREF(w);
8266 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8268 /* No mapping found means: mapping is undefined. */
8269 PyErr_Clear();
8270 x = Py_None;
8271 Py_INCREF(x);
8272 return x;
8273 } else
8274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008276 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008278 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 long value = PyLong_AS_LONG(x);
8280 if (value < 0 || value > 255) {
8281 PyErr_SetString(PyExc_TypeError,
8282 "character mapping must be in range(256)");
8283 Py_DECREF(x);
8284 return NULL;
8285 }
8286 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008288 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 /* wrong return value */
8292 PyErr_Format(PyExc_TypeError,
8293 "character mapping must return integer, bytes or None, not %.400s",
8294 x->ob_type->tp_name);
8295 Py_DECREF(x);
8296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 }
8298}
8299
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008300static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008301charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008303 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8304 /* exponentially overallocate to minimize reallocations */
8305 if (requiredsize < 2*outsize)
8306 requiredsize = 2*outsize;
8307 if (_PyBytes_Resize(outobj, requiredsize))
8308 return -1;
8309 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310}
8311
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008316 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 space is available. Return a new reference to the object that
8318 was put in the output buffer, or Py_None, if the mapping was undefined
8319 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008320 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008321static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008322charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 PyObject *rep;
8326 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008327 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328
Christian Heimes90aa7642007-12-19 02:45:37 +00008329 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 if (res == -1)
8333 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 if (outsize<requiredsize)
8335 if (charmapencode_resize(outobj, outpos, requiredsize))
8336 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008337 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 outstart[(*outpos)++] = (char)res;
8339 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 }
8341
8342 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 Py_DECREF(rep);
8347 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 if (PyLong_Check(rep)) {
8350 Py_ssize_t requiredsize = *outpos+1;
8351 if (outsize<requiredsize)
8352 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8353 Py_DECREF(rep);
8354 return enc_EXCEPTION;
8355 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008356 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 else {
8360 const char *repchars = PyBytes_AS_STRING(rep);
8361 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8362 Py_ssize_t requiredsize = *outpos+repsize;
8363 if (outsize<requiredsize)
8364 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8365 Py_DECREF(rep);
8366 return enc_EXCEPTION;
8367 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008368 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 memcpy(outstart + *outpos, repchars, repsize);
8370 *outpos += repsize;
8371 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008373 Py_DECREF(rep);
8374 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375}
8376
8377/* handle an error in PyUnicode_EncodeCharmap
8378 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008379static int
8380charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008381 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008383 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008384 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385{
8386 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008388 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008389 enum PyUnicode_Kind kind;
8390 void *data;
8391 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008393 Py_ssize_t collstartpos = *inpos;
8394 Py_ssize_t collendpos = *inpos+1;
8395 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 char *encoding = "charmap";
8397 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008398 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008399 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008400 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401
Benjamin Petersonbac79492012-01-14 13:34:47 -05008402 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008403 return -1;
8404 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 /* find all unencodable characters */
8406 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008407 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008408 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008409 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008410 val = encoding_map_lookup(ch, mapping);
8411 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 break;
8413 ++collendpos;
8414 continue;
8415 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008416
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8418 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 if (rep==NULL)
8420 return -1;
8421 else if (rep!=Py_None) {
8422 Py_DECREF(rep);
8423 break;
8424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008425 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 }
8428 /* cache callback name lookup
8429 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008430 if (*error_handler == _Py_ERROR_UNKNOWN)
8431 *error_handler = get_error_handler(errors);
8432
8433 switch (*error_handler) {
8434 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008435 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008437
8438 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 x = charmapencode_output('?', mapping, res, respos);
8441 if (x==enc_EXCEPTION) {
8442 return -1;
8443 }
8444 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008445 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return -1;
8447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 }
8449 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008450 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 *inpos = collendpos;
8452 break;
Victor Stinner50149202015-09-22 00:26:54 +02008453
8454 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 /* generate replacement (temporarily (mis)uses p) */
8456 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 char buffer[2+29+1+1];
8458 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008459 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 for (cp = buffer; *cp; ++cp) {
8461 x = charmapencode_output(*cp, mapping, res, respos);
8462 if (x==enc_EXCEPTION)
8463 return -1;
8464 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008465 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 return -1;
8467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 }
8469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 *inpos = collendpos;
8471 break;
Victor Stinner50149202015-09-22 00:26:54 +02008472
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 default:
Victor Stinner50149202015-09-22 00:26:54 +02008474 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008475 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008479 if (PyBytes_Check(repunicode)) {
8480 /* Directly copy bytes result to output. */
8481 Py_ssize_t outsize = PyBytes_Size(*res);
8482 Py_ssize_t requiredsize;
8483 repsize = PyBytes_Size(repunicode);
8484 requiredsize = *respos + repsize;
8485 if (requiredsize > outsize)
8486 /* Make room for all additional bytes. */
8487 if (charmapencode_resize(res, respos, requiredsize)) {
8488 Py_DECREF(repunicode);
8489 return -1;
8490 }
8491 memcpy(PyBytes_AsString(*res) + *respos,
8492 PyBytes_AsString(repunicode), repsize);
8493 *respos += repsize;
8494 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008495 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008496 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008499 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008500 Py_DECREF(repunicode);
8501 return -1;
8502 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008503 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008504 data = PyUnicode_DATA(repunicode);
8505 kind = PyUnicode_KIND(repunicode);
8506 for (index = 0; index < repsize; index++) {
8507 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8508 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008510 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return -1;
8512 }
8513 else if (x==enc_FAILED) {
8514 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008515 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
8517 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 }
8519 *inpos = newpos;
8520 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 }
8522 return 0;
8523}
8524
Alexander Belopolsky40018472011-02-26 01:02:56 +00008525PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008526_PyUnicode_EncodeCharmap(PyObject *unicode,
8527 PyObject *mapping,
8528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 /* output object */
8531 PyObject *res = NULL;
8532 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008533 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008537 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008539 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008540 void *data;
8541 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Benjamin Petersonbac79492012-01-14 13:34:47 -05008543 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008544 return NULL;
8545 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008546 data = PyUnicode_DATA(unicode);
8547 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008548
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 /* Default to Latin-1 */
8550 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 /* allocate enough for a simple encoding without
8554 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008555 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 if (res == NULL)
8557 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008558 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008562 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 if (x==enc_EXCEPTION) /* error */
8566 goto onError;
8567 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008568 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008570 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 &res, &respos)) {
8572 goto onError;
8573 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 else
8576 /* done with this character => adjust input position */
8577 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008581 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008582 if (_PyBytes_Resize(&res, respos) < 0)
8583 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008586 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 return res;
8588
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 Py_XDECREF(res);
8591 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008592 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 return NULL;
8594}
8595
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008596/* Deprecated */
8597PyObject *
8598PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8599 Py_ssize_t size,
8600 PyObject *mapping,
8601 const char *errors)
8602{
8603 PyObject *result;
8604 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8605 if (unicode == NULL)
8606 return NULL;
8607 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8608 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008609 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008610}
8611
Alexander Belopolsky40018472011-02-26 01:02:56 +00008612PyObject *
8613PyUnicode_AsCharmapString(PyObject *unicode,
8614 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615{
8616 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 PyErr_BadArgument();
8618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008620 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621}
8622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008624static void
8625make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627 Py_ssize_t startpos, Py_ssize_t endpos,
8628 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 *exceptionObject = _PyUnicodeTranslateError_Create(
8632 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 }
8634 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8636 goto onError;
8637 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8638 goto onError;
8639 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8640 goto onError;
8641 return;
8642 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008643 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 }
8645}
8646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647/* error handling callback helper:
8648 build arguments, call the callback and check the arguments,
8649 put the result into newpos and return the replacement string, which
8650 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008651static PyObject *
8652unicode_translate_call_errorhandler(const char *errors,
8653 PyObject **errorHandler,
8654 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008656 Py_ssize_t startpos, Py_ssize_t endpos,
8657 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008659 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008661 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 PyObject *restuple;
8663 PyObject *resunicode;
8664
8665 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 }
8670
8671 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675
8676 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008681 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 Py_DECREF(restuple);
8683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 }
8685 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 &resunicode, &i_newpos)) {
8687 Py_DECREF(restuple);
8688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008690 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 else
8693 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008695 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 Py_DECREF(restuple);
8697 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 Py_INCREF(resunicode);
8700 Py_DECREF(restuple);
8701 return resunicode;
8702}
8703
8704/* Lookup the character ch in the mapping and put the result in result,
8705 which must be decrefed by the caller.
8706 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008707static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709{
Christian Heimes217cfd12007-12-02 14:31:20 +00008710 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 PyObject *x;
8712
8713 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 x = PyObject_GetItem(mapping, w);
8716 Py_DECREF(w);
8717 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8719 /* No mapping found means: use 1:1 mapping. */
8720 PyErr_Clear();
8721 *result = NULL;
8722 return 0;
8723 } else
8724 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725 }
8726 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 *result = x;
8728 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008730 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008732 if (value < 0 || value > MAX_UNICODE) {
8733 PyErr_Format(PyExc_ValueError,
8734 "character mapping must be in range(0x%x)",
8735 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 Py_DECREF(x);
8737 return -1;
8738 }
8739 *result = x;
8740 return 0;
8741 }
8742 else if (PyUnicode_Check(x)) {
8743 *result = x;
8744 return 0;
8745 }
8746 else {
8747 /* wrong return value */
8748 PyErr_SetString(PyExc_TypeError,
8749 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008750 Py_DECREF(x);
8751 return -1;
8752 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008753}
Victor Stinner1194ea02014-04-04 19:37:40 +02008754
8755/* lookup the character, write the result into the writer.
8756 Return 1 if the result was written into the writer, return 0 if the mapping
8757 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008758static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008759charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8760 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761{
Victor Stinner1194ea02014-04-04 19:37:40 +02008762 PyObject *item;
8763
8764 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008766
8767 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008774
8775 if (item == Py_None) {
8776 Py_DECREF(item);
8777 return 0;
8778 }
8779
8780 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008781 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8782 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8783 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008784 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8785 Py_DECREF(item);
8786 return -1;
8787 }
8788 Py_DECREF(item);
8789 return 1;
8790 }
8791
8792 if (!PyUnicode_Check(item)) {
8793 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008795 }
8796
8797 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8798 Py_DECREF(item);
8799 return -1;
8800 }
8801
8802 Py_DECREF(item);
8803 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804}
8805
Victor Stinner89a76ab2014-04-05 11:44:04 +02008806static int
8807unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8808 Py_UCS1 *translate)
8809{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008810 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008811 int ret = 0;
8812
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813 if (charmaptranslate_lookup(ch, mapping, &item)) {
8814 return -1;
8815 }
8816
8817 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008818 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008819 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008821 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 /* not found => default to 1:1 mapping */
8823 translate[ch] = ch;
8824 return 1;
8825 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008826 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008827 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008828 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8829 used it */
8830 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008831 /* invalid character or character outside ASCII:
8832 skip the fast translate */
8833 goto exit;
8834 }
8835 translate[ch] = (Py_UCS1)replace;
8836 }
8837 else if (PyUnicode_Check(item)) {
8838 Py_UCS4 replace;
8839
8840 if (PyUnicode_READY(item) == -1) {
8841 Py_DECREF(item);
8842 return -1;
8843 }
8844 if (PyUnicode_GET_LENGTH(item) != 1)
8845 goto exit;
8846
8847 replace = PyUnicode_READ_CHAR(item, 0);
8848 if (replace > 127)
8849 goto exit;
8850 translate[ch] = (Py_UCS1)replace;
8851 }
8852 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008853 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 goto exit;
8855 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 ret = 1;
8857
Benjamin Peterson1365de72014-04-07 20:15:41 -04008858 exit:
8859 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 return ret;
8861}
8862
8863/* Fast path for ascii => ascii translation. Return 1 if the whole string
8864 was translated into writer, return 0 if the input string was partially
8865 translated into writer, raise an exception and return -1 on error. */
8866static int
8867unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008868 _PyUnicodeWriter *writer, int ignore,
8869 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870{
Victor Stinner872b2912014-04-05 14:27:07 +02008871 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872 Py_ssize_t len;
8873 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008874 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875
Victor Stinner89a76ab2014-04-05 11:44:04 +02008876 len = PyUnicode_GET_LENGTH(input);
8877
Victor Stinner872b2912014-04-05 14:27:07 +02008878 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879
8880 in = PyUnicode_1BYTE_DATA(input);
8881 end = in + len;
8882
8883 assert(PyUnicode_IS_ASCII(writer->buffer));
8884 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8885 out = PyUnicode_1BYTE_DATA(writer->buffer);
8886
Victor Stinner872b2912014-04-05 14:27:07 +02008887 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008889 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008891 int translate = unicode_fast_translate_lookup(mapping, ch,
8892 ascii_table);
8893 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008895 if (translate == 0)
8896 goto exit;
8897 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 }
Victor Stinner872b2912014-04-05 14:27:07 +02008899 if (ch2 == 0xfe) {
8900 if (ignore)
8901 continue;
8902 goto exit;
8903 }
8904 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008905 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008906 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907 }
Victor Stinner872b2912014-04-05 14:27:07 +02008908 res = 1;
8909
8910exit:
8911 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008912 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008913 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914}
8915
Victor Stinner3222da22015-10-01 22:07:32 +02008916static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917_PyUnicode_TranslateCharmap(PyObject *input,
8918 PyObject *mapping,
8919 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008922 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 Py_ssize_t size, i;
8924 int kind;
8925 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 _PyUnicodeWriter writer;
8927 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008928 char *reason = "character maps to <undefined>";
8929 PyObject *errorHandler = NULL;
8930 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008932 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008933
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 PyErr_BadArgument();
8936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 if (PyUnicode_READY(input) == -1)
8940 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008941 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 kind = PyUnicode_KIND(input);
8943 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008945 if (size == 0)
8946 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948 /* allocate enough for a simple 1:1 translation without
8949 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008950 _PyUnicodeWriter_Init(&writer);
8951 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953
Victor Stinner872b2912014-04-05 14:27:07 +02008954 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8955
Victor Stinner33798672016-03-01 21:59:58 +01008956 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008957 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008958 if (PyUnicode_IS_ASCII(input)) {
8959 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8960 if (res < 0) {
8961 _PyUnicodeWriter_Dealloc(&writer);
8962 return NULL;
8963 }
8964 if (res == 1)
8965 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008966 }
Victor Stinner33798672016-03-01 21:59:58 +01008967 else {
8968 i = 0;
8969 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008973 int translate;
8974 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8975 Py_ssize_t newpos;
8976 /* startpos for collecting untranslatable chars */
8977 Py_ssize_t collstart;
8978 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 ch = PyUnicode_READ(kind, data, i);
8982 translate = charmaptranslate_output(ch, mapping, &writer);
8983 if (translate < 0)
8984 goto onError;
8985
8986 if (translate != 0) {
8987 /* it worked => adjust input pointer */
8988 ++i;
8989 continue;
8990 }
8991
8992 /* untranslatable character */
8993 collstart = i;
8994 collend = i+1;
8995
8996 /* find all untranslatable characters */
8997 while (collend < size) {
8998 PyObject *x;
8999 ch = PyUnicode_READ(kind, data, collend);
9000 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009001 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009002 Py_XDECREF(x);
9003 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 ++collend;
9006 }
9007
9008 if (ignore) {
9009 i = collend;
9010 }
9011 else {
9012 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9013 reason, input, &exc,
9014 collstart, collend, &newpos);
9015 if (repunicode == NULL)
9016 goto onError;
9017 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009019 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009020 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 Py_DECREF(repunicode);
9022 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009023 }
9024 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009025 Py_XDECREF(exc);
9026 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009027 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009030 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009031 Py_XDECREF(exc);
9032 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 return NULL;
9034}
9035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036/* Deprecated. Use PyUnicode_Translate instead. */
9037PyObject *
9038PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9039 Py_ssize_t size,
9040 PyObject *mapping,
9041 const char *errors)
9042{
Christian Heimes5f520f42012-09-11 14:03:25 +02009043 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9045 if (!unicode)
9046 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009047 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9048 Py_DECREF(unicode);
9049 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050}
9051
Alexander Belopolsky40018472011-02-26 01:02:56 +00009052PyObject *
9053PyUnicode_Translate(PyObject *str,
9054 PyObject *mapping,
9055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009057 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009058 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009059 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060}
Tim Petersced69f82003-09-16 20:30:58 +00009061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009063fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064{
9065 /* No need to call PyUnicode_READY(self) because this function is only
9066 called as a callback from fixup() which does it already. */
9067 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9068 const int kind = PyUnicode_KIND(self);
9069 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009070 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009071 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 Py_ssize_t i;
9073
9074 for (i = 0; i < len; ++i) {
9075 ch = PyUnicode_READ(kind, data, i);
9076 fixed = 0;
9077 if (ch > 127) {
9078 if (Py_UNICODE_ISSPACE(ch))
9079 fixed = ' ';
9080 else {
9081 const int decimal = Py_UNICODE_TODECIMAL(ch);
9082 if (decimal >= 0)
9083 fixed = '0' + decimal;
9084 }
9085 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009086 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009087 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 PyUnicode_WRITE(kind, data, i, fixed);
9089 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009090 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009091 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 }
9094
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009095 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096}
9097
9098PyObject *
9099_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9100{
9101 if (!PyUnicode_Check(unicode)) {
9102 PyErr_BadInternalCall();
9103 return NULL;
9104 }
9105 if (PyUnicode_READY(unicode) == -1)
9106 return NULL;
9107 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9108 /* If the string is already ASCII, just return the same string */
9109 Py_INCREF(unicode);
9110 return unicode;
9111 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009112 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113}
9114
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009115PyObject *
9116PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9117 Py_ssize_t length)
9118{
Victor Stinnerf0124502011-11-21 23:12:56 +01009119 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009120 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009121 Py_UCS4 maxchar;
9122 enum PyUnicode_Kind kind;
9123 void *data;
9124
Victor Stinner99d7ad02012-02-22 13:37:39 +01009125 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009126 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009127 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009128 if (ch > 127) {
9129 int decimal = Py_UNICODE_TODECIMAL(ch);
9130 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009131 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009132 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009133 }
9134 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009135
9136 /* Copy to a new string */
9137 decimal = PyUnicode_New(length, maxchar);
9138 if (decimal == NULL)
9139 return decimal;
9140 kind = PyUnicode_KIND(decimal);
9141 data = PyUnicode_DATA(decimal);
9142 /* Iterate over code points */
9143 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009144 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009145 if (ch > 127) {
9146 int decimal = Py_UNICODE_TODECIMAL(ch);
9147 if (decimal >= 0)
9148 ch = '0' + decimal;
9149 }
9150 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009152 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009153}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009154/* --- Decimal Encoder ---------------------------------------------------- */
9155
Alexander Belopolsky40018472011-02-26 01:02:56 +00009156int
9157PyUnicode_EncodeDecimal(Py_UNICODE *s,
9158 Py_ssize_t length,
9159 char *output,
9160 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009161{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009162 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009163 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009164 enum PyUnicode_Kind kind;
9165 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009166
9167 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 PyErr_BadArgument();
9169 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009170 }
9171
Victor Stinner42bf7752011-11-21 22:52:58 +01009172 unicode = PyUnicode_FromUnicode(s, length);
9173 if (unicode == NULL)
9174 return -1;
9175
Benjamin Petersonbac79492012-01-14 13:34:47 -05009176 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009177 Py_DECREF(unicode);
9178 return -1;
9179 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 kind = PyUnicode_KIND(unicode);
9181 data = PyUnicode_DATA(unicode);
9182
Victor Stinnerb84d7232011-11-22 01:50:07 +01009183 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009184 PyObject *exc;
9185 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009187 Py_ssize_t startpos;
9188
9189 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009190
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009193 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 decimal = Py_UNICODE_TODECIMAL(ch);
9197 if (decimal >= 0) {
9198 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009199 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 continue;
9201 }
9202 if (0 < ch && ch < 256) {
9203 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009204 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 continue;
9206 }
Victor Stinner6345be92011-11-25 20:09:01 +01009207
Victor Stinner42bf7752011-11-21 22:52:58 +01009208 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009209 exc = NULL;
9210 raise_encode_exception(&exc, "decimal", unicode,
9211 startpos, startpos+1,
9212 "invalid decimal Unicode string");
9213 Py_XDECREF(exc);
9214 Py_DECREF(unicode);
9215 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009216 }
9217 /* 0-terminate the output string */
9218 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009219 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009221}
9222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223/* --- Helpers ------------------------------------------------------------ */
9224
/* Helper macro to fix up start/end slice values in place.

   `end` is capped at `len`; a negative `end` or `start` has `len` added to
   it and is then floored at 0.  `start` and `end` must be modifiable
   lvalues.  `len` is evaluated more than once, so it must be free of side
   effects.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and can be used safely in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009241any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009243 Py_ssize_t end,
9244 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009246 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 void *buf1, *buf2;
9248 Py_ssize_t len1, len2, result;
9249
9250 kind1 = PyUnicode_KIND(s1);
9251 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009252 if (kind1 < kind2)
9253 return -1;
9254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 len1 = PyUnicode_GET_LENGTH(s1);
9256 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009257 ADJUST_INDICES(start, end, len1);
9258 if (end - start < len2)
9259 return -1;
9260
9261 buf1 = PyUnicode_DATA(s1);
9262 buf2 = PyUnicode_DATA(s2);
9263 if (len2 == 1) {
9264 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9265 result = findchar((const char *)buf1 + kind1*start,
9266 kind1, end - start, ch, direction);
9267 if (result == -1)
9268 return -1;
9269 else
9270 return start + result;
9271 }
9272
9273 if (kind2 != kind1) {
9274 buf2 = _PyUnicode_AsKind(s2, kind1);
9275 if (!buf2)
9276 return -2;
9277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278
Victor Stinner794d5672011-10-10 03:21:36 +02009279 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009281 case PyUnicode_1BYTE_KIND:
9282 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9283 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9284 else
9285 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 case PyUnicode_2BYTE_KIND:
9288 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 case PyUnicode_4BYTE_KIND:
9291 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9292 break;
9293 default:
9294 assert(0); result = -2;
9295 }
9296 }
9297 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009298 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009299 case PyUnicode_1BYTE_KIND:
9300 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9301 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9302 else
9303 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 break;
9305 case PyUnicode_2BYTE_KIND:
9306 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 break;
9308 case PyUnicode_4BYTE_KIND:
9309 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 default:
9312 assert(0); result = -2;
9313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 }
9315
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009316 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 PyMem_Free(buf2);
9318
9319 return result;
9320}
9321
9322Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009323_PyUnicode_InsertThousandsGrouping(
9324 PyObject *unicode, Py_ssize_t index,
9325 Py_ssize_t n_buffer,
9326 void *digits, Py_ssize_t n_digits,
9327 Py_ssize_t min_width,
9328 const char *grouping, PyObject *thousands_sep,
9329 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330{
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009332 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009333 Py_ssize_t thousands_sep_len;
9334 Py_ssize_t len;
9335
9336 if (unicode != NULL) {
9337 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009338 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009339 }
9340 else {
9341 kind = PyUnicode_1BYTE_KIND;
9342 data = NULL;
9343 }
9344 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9345 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9346 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9347 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009348 if (thousands_sep_kind < kind) {
9349 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9350 if (!thousands_sep_data)
9351 return -1;
9352 }
9353 else {
9354 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9355 if (!data)
9356 return -1;
9357 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 }
9359
Benjamin Petersonead6b532011-12-20 17:23:42 -06009360 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009362 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009364 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009366 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009367 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009369 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009371 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009377 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009381 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009383 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 break;
9385 default:
9386 assert(0);
9387 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009389 if (unicode != NULL && thousands_sep_kind != kind) {
9390 if (thousands_sep_kind < kind)
9391 PyMem_Free(thousands_sep_data);
9392 else
9393 PyMem_Free(data);
9394 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009395 if (unicode == NULL) {
9396 *maxchar = 127;
9397 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009398 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009399 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009400 }
9401 }
9402 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403}
9404
9405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406Py_ssize_t
9407PyUnicode_Count(PyObject *str,
9408 PyObject *substr,
9409 Py_ssize_t start,
9410 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009412 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 void *buf1 = NULL, *buf2 = NULL;
9415 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009416
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009417 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 kind1 = PyUnicode_KIND(str);
9421 kind2 = PyUnicode_KIND(substr);
9422 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009425 len1 = PyUnicode_GET_LENGTH(str);
9426 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009430
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 buf1 = PyUnicode_DATA(str);
9432 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435 if (!buf2)
9436 goto onError;
9437 }
9438
9439 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009442 result = asciilib_count(
9443 ((Py_UCS1*)buf1) + start, end - start,
9444 buf2, len2, PY_SSIZE_T_MAX
9445 );
9446 else
9447 result = ucs1lib_count(
9448 ((Py_UCS1*)buf1) + start, end - start,
9449 buf2, len2, PY_SSIZE_T_MAX
9450 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 break;
9452 case PyUnicode_2BYTE_KIND:
9453 result = ucs2lib_count(
9454 ((Py_UCS2*)buf1) + start, end - start,
9455 buf2, len2, PY_SSIZE_T_MAX
9456 );
9457 break;
9458 case PyUnicode_4BYTE_KIND:
9459 result = ucs4lib_count(
9460 ((Py_UCS4*)buf1) + start, end - start,
9461 buf2, len2, PY_SSIZE_T_MAX
9462 );
9463 break;
9464 default:
9465 assert(0); result = 0;
9466 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009467
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009468 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 PyMem_Free(buf2);
9470
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009473 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 PyMem_Free(buf2);
9475 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Alexander Belopolsky40018472011-02-26 01:02:56 +00009478Py_ssize_t
9479PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009480 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481 Py_ssize_t start,
9482 Py_ssize_t end,
9483 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009485 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009487
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489}
9490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491Py_ssize_t
9492PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9493 Py_ssize_t start, Py_ssize_t end,
9494 int direction)
9495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009497 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (PyUnicode_READY(str) == -1)
9499 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009500 if (start < 0 || end < 0) {
9501 PyErr_SetString(PyExc_IndexError, "string index out of range");
9502 return -2;
9503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 if (end > PyUnicode_GET_LENGTH(str))
9505 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 if (start >= end)
9507 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009509 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9510 kind, end-start, ch, direction);
9511 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009513 else
9514 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515}
9516
Alexander Belopolsky40018472011-02-26 01:02:56 +00009517static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009518tailmatch(PyObject *self,
9519 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009520 Py_ssize_t start,
9521 Py_ssize_t end,
9522 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 int kind_self;
9525 int kind_sub;
9526 void *data_self;
9527 void *data_sub;
9528 Py_ssize_t offset;
9529 Py_ssize_t i;
9530 Py_ssize_t end_sub;
9531
9532 if (PyUnicode_READY(self) == -1 ||
9533 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009534 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9537 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009541 if (PyUnicode_GET_LENGTH(substring) == 0)
9542 return 1;
9543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 kind_self = PyUnicode_KIND(self);
9545 data_self = PyUnicode_DATA(self);
9546 kind_sub = PyUnicode_KIND(substring);
9547 data_sub = PyUnicode_DATA(substring);
9548 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9549
9550 if (direction > 0)
9551 offset = end;
9552 else
9553 offset = start;
9554
9555 if (PyUnicode_READ(kind_self, data_self, offset) ==
9556 PyUnicode_READ(kind_sub, data_sub, 0) &&
9557 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9558 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9559 /* If both are of the same kind, memcmp is sufficient */
9560 if (kind_self == kind_sub) {
9561 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009562 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 data_sub,
9564 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009565 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009567 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 else {
9569 /* We do not need to compare 0 and len(substring)-1 because
9570 the if statement above ensured already that they are equal
9571 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 for (i = 1; i < end_sub; ++i) {
9573 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9574 PyUnicode_READ(kind_sub, data_sub, i))
9575 return 0;
9576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579 }
9580
9581 return 0;
9582}
9583
Alexander Belopolsky40018472011-02-26 01:02:56 +00009584Py_ssize_t
9585PyUnicode_Tailmatch(PyObject *str,
9586 PyObject *substr,
9587 Py_ssize_t start,
9588 Py_ssize_t end,
9589 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009591 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009593
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009594 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595}
9596
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597/* Apply fixfct filter to the Unicode object self and return a
9598 reference to the modified object */
9599
Alexander Belopolsky40018472011-02-26 01:02:56 +00009600static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009601fixup(PyObject *self,
9602 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 PyObject *u;
9605 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009606 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009608 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009611 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 /* fix functions return the new maximum character in a string,
9614 if the kind of the resulting unicode object does not change,
9615 everything is fine. Otherwise we need to change the string kind
9616 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009617 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009618
9619 if (maxchar_new == 0) {
9620 /* no changes */;
9621 if (PyUnicode_CheckExact(self)) {
9622 Py_DECREF(u);
9623 Py_INCREF(self);
9624 return self;
9625 }
9626 else
9627 return u;
9628 }
9629
Victor Stinnere6abb482012-05-02 01:15:40 +02009630 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631
Victor Stinnereaab6042011-12-11 22:22:39 +01009632 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009634
9635 /* In case the maximum character changed, we need to
9636 convert the string to the new category. */
9637 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9638 if (v == NULL) {
9639 Py_DECREF(u);
9640 return NULL;
9641 }
9642 if (maxchar_new > maxchar_old) {
9643 /* If the maxchar increased so that the kind changed, not all
9644 characters are representable anymore and we need to fix the
9645 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009646 _PyUnicode_FastCopyCharacters(v, 0,
9647 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009648 maxchar_old = fixfct(v);
9649 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 }
9651 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009652 _PyUnicode_FastCopyCharacters(v, 0,
9653 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009655 Py_DECREF(u);
9656 assert(_PyUnicode_CheckConsistency(v, 1));
9657 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658}
9659
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660static PyObject *
9661ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9664 char *resdata, *data = PyUnicode_DATA(self);
9665 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009666
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 res = PyUnicode_New(len, 127);
9668 if (res == NULL)
9669 return NULL;
9670 resdata = PyUnicode_DATA(res);
9671 if (lower)
9672 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 _Py_bytes_upper(resdata, data, len);
9675 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676}
9677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 Py_ssize_t j;
9682 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009683 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9687
9688 where ! is a negation and \p{xxx} is a character with property xxx.
9689 */
9690 for (j = i - 1; j >= 0; j--) {
9691 c = PyUnicode_READ(kind, data, j);
9692 if (!_PyUnicode_IsCaseIgnorable(c))
9693 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9696 if (final_sigma) {
9697 for (j = i + 1; j < length; j++) {
9698 c = PyUnicode_READ(kind, data, j);
9699 if (!_PyUnicode_IsCaseIgnorable(c))
9700 break;
9701 }
9702 final_sigma = j == length || !_PyUnicode_IsCased(c);
9703 }
9704 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705}
9706
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707static int
9708lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9709 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 /* Obscure special case. */
9712 if (c == 0x3A3) {
9713 mapped[0] = handle_capital_sigma(kind, data, length, i);
9714 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717}
9718
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719static Py_ssize_t
9720do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 Py_ssize_t i, k = 0;
9723 int n_res, j;
9724 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009725
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 c = PyUnicode_READ(kind, data, 0);
9727 n_res = _PyUnicode_ToUpperFull(c, mapped);
9728 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009729 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 for (i = 1; i < length; i++) {
9733 c = PyUnicode_READ(kind, data, i);
9734 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9735 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009736 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009739 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741}
9742
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743static Py_ssize_t
9744do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9745 Py_ssize_t i, k = 0;
9746
9747 for (i = 0; i < length; i++) {
9748 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9749 int n_res, j;
9750 if (Py_UNICODE_ISUPPER(c)) {
9751 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9752 }
9753 else if (Py_UNICODE_ISLOWER(c)) {
9754 n_res = _PyUnicode_ToUpperFull(c, mapped);
9755 }
9756 else {
9757 n_res = 1;
9758 mapped[0] = c;
9759 }
9760 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009761 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 res[k++] = mapped[j];
9763 }
9764 }
9765 return k;
9766}
9767
9768static Py_ssize_t
9769do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9770 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009772 Py_ssize_t i, k = 0;
9773
9774 for (i = 0; i < length; i++) {
9775 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9776 int n_res, j;
9777 if (lower)
9778 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779 else
9780 n_res = _PyUnicode_ToUpperFull(c, mapped);
9781 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009782 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009783 res[k++] = mapped[j];
9784 }
9785 }
9786 return k;
9787}
9788
9789static Py_ssize_t
9790do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9793}
9794
9795static Py_ssize_t
9796do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9799}
9800
Benjamin Petersone51757f2012-01-12 21:10:29 -05009801static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009802do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9803{
9804 Py_ssize_t i, k = 0;
9805
9806 for (i = 0; i < length; i++) {
9807 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9808 Py_UCS4 mapped[3];
9809 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9810 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009811 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009812 res[k++] = mapped[j];
9813 }
9814 }
9815 return k;
9816}
9817
9818static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009819do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9820{
9821 Py_ssize_t i, k = 0;
9822 int previous_is_cased;
9823
9824 previous_is_cased = 0;
9825 for (i = 0; i < length; i++) {
9826 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827 Py_UCS4 mapped[3];
9828 int n_res, j;
9829
9830 if (previous_is_cased)
9831 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9832 else
9833 n_res = _PyUnicode_ToTitleFull(c, mapped);
9834
9835 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009836 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009837 res[k++] = mapped[j];
9838 }
9839
9840 previous_is_cased = _PyUnicode_IsCased(c);
9841 }
9842 return k;
9843}
9844
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845static PyObject *
9846case_operation(PyObject *self,
9847 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9848{
9849 PyObject *res = NULL;
9850 Py_ssize_t length, newlength = 0;
9851 int kind, outkind;
9852 void *data, *outdata;
9853 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9854
Benjamin Petersoneea48462012-01-16 14:28:50 -05009855 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009856
9857 kind = PyUnicode_KIND(self);
9858 data = PyUnicode_DATA(self);
9859 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009860 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009861 PyErr_SetString(PyExc_OverflowError, "string is too long");
9862 return NULL;
9863 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009864 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009865 if (tmp == NULL)
9866 return PyErr_NoMemory();
9867 newlength = perform(kind, data, length, tmp, &maxchar);
9868 res = PyUnicode_New(newlength, maxchar);
9869 if (res == NULL)
9870 goto leave;
9871 tmpend = tmp + newlength;
9872 outdata = PyUnicode_DATA(res);
9873 outkind = PyUnicode_KIND(res);
9874 switch (outkind) {
9875 case PyUnicode_1BYTE_KIND:
9876 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9877 break;
9878 case PyUnicode_2BYTE_KIND:
9879 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9880 break;
9881 case PyUnicode_4BYTE_KIND:
9882 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9883 break;
9884 default:
9885 assert(0);
9886 break;
9887 }
9888 leave:
9889 PyMem_FREE(tmp);
9890 return res;
9891}
9892
Tim Peters8ce9f162004-08-27 01:49:32 +00009893PyObject *
9894PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009896 PyObject *res;
9897 PyObject *fseq;
9898 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009899 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009901 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009902 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009904 }
9905
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009906 /* NOTE: the following code can't call back into Python code,
9907 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009908 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009909
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009910 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009911 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009912 res = _PyUnicode_JoinArray(separator, items, seqlen);
9913 Py_DECREF(fseq);
9914 return res;
9915}
9916
9917PyObject *
9918_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9919{
9920 PyObject *res = NULL; /* the result */
9921 PyObject *sep = NULL;
9922 Py_ssize_t seplen;
9923 PyObject *item;
9924 Py_ssize_t sz, i, res_offset;
9925 Py_UCS4 maxchar;
9926 Py_UCS4 item_maxchar;
9927 int use_memcpy;
9928 unsigned char *res_data = NULL, *sep_data = NULL;
9929 PyObject *last_obj;
9930 unsigned int kind = 0;
9931
Tim Peters05eba1f2004-08-27 21:32:02 +00009932 /* If empty sequence, return u"". */
9933 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009934 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009935 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009936
Tim Peters05eba1f2004-08-27 21:32:02 +00009937 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009938 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 if (seqlen == 1) {
9940 if (PyUnicode_CheckExact(items[0])) {
9941 res = items[0];
9942 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009943 return res;
9944 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009945 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009946 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009947 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009948 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009949 /* Set up sep and seplen */
9950 if (separator == NULL) {
9951 /* fall back to a blank space separator */
9952 sep = PyUnicode_FromOrdinal(' ');
9953 if (!sep)
9954 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009955 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009956 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009957 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009958 else {
9959 if (!PyUnicode_Check(separator)) {
9960 PyErr_Format(PyExc_TypeError,
9961 "separator: expected str instance,"
9962 " %.80s found",
9963 Py_TYPE(separator)->tp_name);
9964 goto onError;
9965 }
9966 if (PyUnicode_READY(separator))
9967 goto onError;
9968 sep = separator;
9969 seplen = PyUnicode_GET_LENGTH(separator);
9970 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9971 /* inc refcount to keep this code path symmetric with the
9972 above case of a blank separator */
9973 Py_INCREF(sep);
9974 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009976 }
9977
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009978 /* There are at least two things to join, or else we have a subclass
9979 * of str in the sequence.
9980 * Do a pre-pass to figure out the total amount of space we'll
9981 * need (sz), and see whether all argument are strings.
9982 */
9983 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009984#ifdef Py_DEBUG
9985 use_memcpy = 0;
9986#else
9987 use_memcpy = 1;
9988#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009989 for (i = 0; i < seqlen; i++) {
9990 const Py_ssize_t old_sz = sz;
9991 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 if (!PyUnicode_Check(item)) {
9993 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009994 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 " %.80s found",
9996 i, Py_TYPE(item)->tp_name);
9997 goto onError;
9998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (PyUnicode_READY(item) == -1)
10000 goto onError;
10001 sz += PyUnicode_GET_LENGTH(item);
10002 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010003 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004 if (i != 0)
10005 sz += seplen;
10006 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
10007 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 goto onError;
10010 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010011 if (use_memcpy && last_obj != NULL) {
10012 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10013 use_memcpy = 0;
10014 }
10015 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010016 }
Tim Petersced69f82003-09-16 20:30:58 +000010017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010019 if (res == NULL)
10020 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010021
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010022 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010023#ifdef Py_DEBUG
10024 use_memcpy = 0;
10025#else
10026 if (use_memcpy) {
10027 res_data = PyUnicode_1BYTE_DATA(res);
10028 kind = PyUnicode_KIND(res);
10029 if (seplen != 0)
10030 sep_data = PyUnicode_1BYTE_DATA(sep);
10031 }
10032#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010033 if (use_memcpy) {
10034 for (i = 0; i < seqlen; ++i) {
10035 Py_ssize_t itemlen;
10036 item = items[i];
10037
10038 /* Copy item, and maybe the separator. */
10039 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 Py_MEMCPY(res_data,
10041 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010042 kind * seplen);
10043 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010044 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010045
10046 itemlen = PyUnicode_GET_LENGTH(item);
10047 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010048 Py_MEMCPY(res_data,
10049 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010050 kind * itemlen);
10051 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010052 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010053 }
10054 assert(res_data == PyUnicode_1BYTE_DATA(res)
10055 + kind * PyUnicode_GET_LENGTH(res));
10056 }
10057 else {
10058 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10059 Py_ssize_t itemlen;
10060 item = items[i];
10061
10062 /* Copy item, and maybe the separator. */
10063 if (i && seplen != 0) {
10064 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10065 res_offset += seplen;
10066 }
10067
10068 itemlen = PyUnicode_GET_LENGTH(item);
10069 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010070 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010071 res_offset += itemlen;
10072 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010073 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010074 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010075 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010078 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010083 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084 return NULL;
10085}
10086
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive character slots of the buffer
 * `data`, beginning at slot index `start`.  `kind` selects the slot width
 * (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND).
 *
 * Every argument is parenthesized where it is used, so expressions such as
 * `k & 3` or `base + 1` are safe to pass.  Arguments may still be evaluated
 * more than once, so they must not have side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10110
Victor Stinnerd3f08822012-05-29 12:57:52 +020010111void
10112_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10113 Py_UCS4 fill_char)
10114{
10115 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10116 const void *data = PyUnicode_DATA(unicode);
10117 assert(PyUnicode_IS_READY(unicode));
10118 assert(unicode_modifiable(unicode));
10119 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10120 assert(start >= 0);
10121 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10122 FILL(kind, data, fill_char, start, length);
10123}
10124
Victor Stinner3fe55312012-01-04 00:33:50 +010010125Py_ssize_t
10126PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10127 Py_UCS4 fill_char)
10128{
10129 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010130
10131 if (!PyUnicode_Check(unicode)) {
10132 PyErr_BadInternalCall();
10133 return -1;
10134 }
10135 if (PyUnicode_READY(unicode) == -1)
10136 return -1;
10137 if (unicode_check_modifiable(unicode))
10138 return -1;
10139
Victor Stinnerd3f08822012-05-29 12:57:52 +020010140 if (start < 0) {
10141 PyErr_SetString(PyExc_IndexError, "string index out of range");
10142 return -1;
10143 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010144 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10145 PyErr_SetString(PyExc_ValueError,
10146 "fill character is bigger than "
10147 "the string maximum character");
10148 return -1;
10149 }
10150
10151 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10152 length = Py_MIN(maxlen, length);
10153 if (length <= 0)
10154 return 0;
10155
Victor Stinnerd3f08822012-05-29 12:57:52 +020010156 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010157 return length;
10158}
10159
Victor Stinner9310abb2011-10-05 00:59:23 +020010160static PyObject *
10161pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010162 Py_ssize_t left,
10163 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 PyObject *u;
10167 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010168 int kind;
10169 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
10171 if (left < 0)
10172 left = 0;
10173 if (right < 0)
10174 right = 0;
10175
Victor Stinnerc4b49542011-12-11 22:44:26 +010010176 if (left == 0 && right == 0)
10177 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10180 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010181 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10182 return NULL;
10183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010185 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010187 if (!u)
10188 return NULL;
10189
10190 kind = PyUnicode_KIND(u);
10191 data = PyUnicode_DATA(u);
10192 if (left)
10193 FILL(kind, data, fill, 0, left);
10194 if (right)
10195 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010196 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010197 assert(_PyUnicode_CheckConsistency(u, 1));
10198 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199}
10200
Alexander Belopolsky40018472011-02-26 01:02:56 +000010201PyObject *
10202PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010206 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208
Benjamin Petersonead6b532011-12-20 17:23:42 -060010209 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 if (PyUnicode_IS_ASCII(string))
10212 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 PyUnicode_GET_LENGTH(string), keepends);
10215 else
10216 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 break;
10220 case PyUnicode_2BYTE_KIND:
10221 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyUnicode_GET_LENGTH(string), keepends);
10224 break;
10225 case PyUnicode_4BYTE_KIND:
10226 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010227 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 PyUnicode_GET_LENGTH(string), keepends);
10229 break;
10230 default:
10231 assert(0);
10232 list = 0;
10233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235}
10236
Alexander Belopolsky40018472011-02-26 01:02:56 +000010237static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010238split(PyObject *self,
10239 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010240 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010242 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 void *buf1, *buf2;
10244 Py_ssize_t len1, len2;
10245 PyObject* out;
10246
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010248 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (PyUnicode_READY(self) == -1)
10251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010254 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010256 if (PyUnicode_IS_ASCII(self))
10257 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010259 PyUnicode_GET_LENGTH(self), maxcount
10260 );
10261 else
10262 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010264 PyUnicode_GET_LENGTH(self), maxcount
10265 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 case PyUnicode_2BYTE_KIND:
10267 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010268 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 PyUnicode_GET_LENGTH(self), maxcount
10270 );
10271 case PyUnicode_4BYTE_KIND:
10272 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010273 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 PyUnicode_GET_LENGTH(self), maxcount
10275 );
10276 default:
10277 assert(0);
10278 return NULL;
10279 }
10280
10281 if (PyUnicode_READY(substring) == -1)
10282 return NULL;
10283
10284 kind1 = PyUnicode_KIND(self);
10285 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 len1 = PyUnicode_GET_LENGTH(self);
10287 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010288 if (kind1 < kind2 || len1 < len2) {
10289 out = PyList_New(1);
10290 if (out == NULL)
10291 return NULL;
10292 Py_INCREF(self);
10293 PyList_SET_ITEM(out, 0, self);
10294 return out;
10295 }
10296 buf1 = PyUnicode_DATA(self);
10297 buf2 = PyUnicode_DATA(substring);
10298 if (kind2 != kind1) {
10299 buf2 = _PyUnicode_AsKind(substring, kind1);
10300 if (!buf2)
10301 return NULL;
10302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010304 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010306 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10307 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010309 else
10310 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010311 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 break;
10313 case PyUnicode_2BYTE_KIND:
10314 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010315 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 break;
10317 case PyUnicode_4BYTE_KIND:
10318 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 break;
10321 default:
10322 out = NULL;
10323 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010324 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 PyMem_Free(buf2);
10326 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Alexander Belopolsky40018472011-02-26 01:02:56 +000010329static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010330rsplit(PyObject *self,
10331 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010332 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010333{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010334 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 void *buf1, *buf2;
10336 Py_ssize_t len1, len2;
10337 PyObject* out;
10338
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010339 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010340 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_READY(self) == -1)
10343 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010346 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 if (PyUnicode_IS_ASCII(self))
10349 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010351 PyUnicode_GET_LENGTH(self), maxcount
10352 );
10353 else
10354 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 case PyUnicode_2BYTE_KIND:
10359 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010360 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 PyUnicode_GET_LENGTH(self), maxcount
10362 );
10363 case PyUnicode_4BYTE_KIND:
10364 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010365 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 PyUnicode_GET_LENGTH(self), maxcount
10367 );
10368 default:
10369 assert(0);
10370 return NULL;
10371 }
10372
10373 if (PyUnicode_READY(substring) == -1)
10374 return NULL;
10375
10376 kind1 = PyUnicode_KIND(self);
10377 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 len1 = PyUnicode_GET_LENGTH(self);
10379 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010380 if (kind1 < kind2 || len1 < len2) {
10381 out = PyList_New(1);
10382 if (out == NULL)
10383 return NULL;
10384 Py_INCREF(self);
10385 PyList_SET_ITEM(out, 0, self);
10386 return out;
10387 }
10388 buf1 = PyUnicode_DATA(self);
10389 buf2 = PyUnicode_DATA(substring);
10390 if (kind2 != kind1) {
10391 buf2 = _PyUnicode_AsKind(substring, kind1);
10392 if (!buf2)
10393 return NULL;
10394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010396 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10399 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010401 else
10402 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 break;
10405 case PyUnicode_2BYTE_KIND:
10406 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_4BYTE_KIND:
10410 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 break;
10413 default:
10414 out = NULL;
10415 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010416 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyMem_Free(buf2);
10418 return out;
10419}
10420
10421static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10423 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010425 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10428 return asciilib_find(buf1, len1, buf2, len2, offset);
10429 else
10430 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 case PyUnicode_2BYTE_KIND:
10432 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10433 case PyUnicode_4BYTE_KIND:
10434 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10435 }
10436 assert(0);
10437 return -1;
10438}
10439
10440static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010441anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10442 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010444 switch (kind) {
10445 case PyUnicode_1BYTE_KIND:
10446 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10447 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10448 else
10449 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10450 case PyUnicode_2BYTE_KIND:
10451 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10452 case PyUnicode_4BYTE_KIND:
10453 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10454 }
10455 assert(0);
10456 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010457}
10458
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010459static void
10460replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10461 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10462{
10463 int kind = PyUnicode_KIND(u);
10464 void *data = PyUnicode_DATA(u);
10465 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10466 if (kind == PyUnicode_1BYTE_KIND) {
10467 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10468 (Py_UCS1 *)data + len,
10469 u1, u2, maxcount);
10470 }
10471 else if (kind == PyUnicode_2BYTE_KIND) {
10472 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10473 (Py_UCS2 *)data + len,
10474 u1, u2, maxcount);
10475 }
10476 else {
10477 assert(kind == PyUnicode_4BYTE_KIND);
10478 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10479 (Py_UCS4 *)data + len,
10480 u1, u2, maxcount);
10481 }
10482}
10483
Alexander Belopolsky40018472011-02-26 01:02:56 +000010484static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485replace(PyObject *self, PyObject *str1,
10486 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 PyObject *u;
10489 char *sbuf = PyUnicode_DATA(self);
10490 char *buf1 = PyUnicode_DATA(str1);
10491 char *buf2 = PyUnicode_DATA(str2);
10492 int srelease = 0, release1 = 0, release2 = 0;
10493 int skind = PyUnicode_KIND(self);
10494 int kind1 = PyUnicode_KIND(str1);
10495 int kind2 = PyUnicode_KIND(str2);
10496 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10497 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10498 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010499 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010500 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501
10502 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010505 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
Victor Stinner59de0ee2011-10-07 10:01:28 +020010507 if (str1 == str2)
10508 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509
Victor Stinner49a0a212011-10-12 23:46:10 +020010510 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010511 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10512 if (maxchar < maxchar_str1)
10513 /* substring too wide to be present */
10514 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010515 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10516 /* Replacing str1 with str2 may cause a maxchar reduction in the
10517 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010519 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010522 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010524 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010527 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010528 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010529
Victor Stinner69ed0f42013-04-09 21:48:24 +020010530 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010531 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010532 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010534 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010536 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010538
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010539 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10540 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 }
10542 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 int rkind = skind;
10544 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010545 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (kind1 < rkind) {
10548 /* widen substring */
10549 buf1 = _PyUnicode_AsKind(str1, rkind);
10550 if (!buf1) goto error;
10551 release1 = 1;
10552 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010553 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010554 if (i < 0)
10555 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (rkind > kind2) {
10557 /* widen replacement */
10558 buf2 = _PyUnicode_AsKind(str2, rkind);
10559 if (!buf2) goto error;
10560 release2 = 1;
10561 }
10562 else if (rkind < kind2) {
10563 /* widen self and buf1 */
10564 rkind = kind2;
10565 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010566 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 sbuf = _PyUnicode_AsKind(self, rkind);
10568 if (!sbuf) goto error;
10569 srelease = 1;
10570 buf1 = _PyUnicode_AsKind(str1, rkind);
10571 if (!buf1) goto error;
10572 release1 = 1;
10573 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 u = PyUnicode_New(slen, maxchar);
10575 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010577 assert(PyUnicode_KIND(u) == rkind);
10578 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010579
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010581 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010586
10587 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010589 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010591 if (i == -1)
10592 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010593 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010595 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010599 }
10600 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010602 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 int rkind = skind;
10604 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 buf1 = _PyUnicode_AsKind(str1, rkind);
10609 if (!buf1) goto error;
10610 release1 = 1;
10611 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010612 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 if (n == 0)
10614 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010616 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 buf2 = _PyUnicode_AsKind(str2, rkind);
10618 if (!buf2) goto error;
10619 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 rkind = kind2;
10624 sbuf = _PyUnicode_AsKind(self, rkind);
10625 if (!sbuf) goto error;
10626 srelease = 1;
10627 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010628 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 buf1 = _PyUnicode_AsKind(str1, rkind);
10630 if (!buf1) goto error;
10631 release1 = 1;
10632 }
10633 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10634 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010635 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 PyErr_SetString(PyExc_OverflowError,
10637 "replace string is too long");
10638 goto error;
10639 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010640 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010641 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010642 _Py_INCREF_UNICODE_EMPTY();
10643 if (!unicode_empty)
10644 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010645 u = unicode_empty;
10646 goto done;
10647 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010648 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 PyErr_SetString(PyExc_OverflowError,
10650 "replace string is too long");
10651 goto error;
10652 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010653 u = PyUnicode_New(new_size, maxchar);
10654 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010656 assert(PyUnicode_KIND(u) == rkind);
10657 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 ires = i = 0;
10659 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 while (n-- > 0) {
10661 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010662 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010663 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010664 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010665 if (j == -1)
10666 break;
10667 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010669 memcpy(res + rkind * ires,
10670 sbuf + rkind * i,
10671 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 }
10674 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010676 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010678 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 memcpy(res + rkind * ires,
10686 sbuf + rkind * i,
10687 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010688 }
10689 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010690 /* interleave */
10691 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010692 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696 if (--n <= 0)
10697 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010698 memcpy(res + rkind * ires,
10699 sbuf + rkind * i,
10700 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 ires++;
10702 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010703 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010704 memcpy(res + rkind * ires,
10705 sbuf + rkind * i,
10706 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010707 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010708 }
10709
10710 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010711 unicode_adjust_maxchar(&u);
10712 if (u == NULL)
10713 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010715
10716 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (srelease)
10718 PyMem_FREE(sbuf);
10719 if (release1)
10720 PyMem_FREE(buf1);
10721 if (release2)
10722 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010723 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010725
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (srelease)
10729 PyMem_FREE(sbuf);
10730 if (release1)
10731 PyMem_FREE(buf1);
10732 if (release2)
10733 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010734 return unicode_result_unchanged(self);
10735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 error:
10737 if (srelease && sbuf)
10738 PyMem_FREE(sbuf);
10739 if (release1 && buf1)
10740 PyMem_FREE(buf1);
10741 if (release2 && buf2)
10742 PyMem_FREE(buf2);
10743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744}
10745
10746/* --- Unicode Object Methods --------------------------------------------- */
10747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750\n\
10751Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010755unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010757 if (PyUnicode_READY(self) == -1)
10758 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010759 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760}
10761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010762PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764\n\
10765Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010766have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
10768static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010769unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010771 if (PyUnicode_READY(self) == -1)
10772 return NULL;
10773 if (PyUnicode_GET_LENGTH(self) == 0)
10774 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010775 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776}
10777
Benjamin Petersond5890c82012-01-14 13:23:30 -050010778PyDoc_STRVAR(casefold__doc__,
10779 "S.casefold() -> str\n\
10780\n\
10781Return a version of S suitable for caseless comparisons.");
10782
10783static PyObject *
10784unicode_casefold(PyObject *self)
10785{
10786 if (PyUnicode_READY(self) == -1)
10787 return NULL;
10788 if (PyUnicode_IS_ASCII(self))
10789 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010790 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010791}
10792
10793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010794/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795
10796static int
10797convert_uc(PyObject *obj, void *addr)
10798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801 if (!PyUnicode_Check(obj)) {
10802 PyErr_Format(PyExc_TypeError,
10803 "The fill character must be a unicode character, "
10804 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 0;
10806 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010807 if (PyUnicode_READY(obj) < 0)
10808 return 0;
10809 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010810 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010815 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010816}
10817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010821Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010822done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
10824static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010825unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010827 Py_ssize_t marg, left;
10828 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 Py_UCS4 fillchar = ' ';
10830
Victor Stinnere9a29352011-10-01 02:14:59 +020010831 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833
Benjamin Petersonbac79492012-01-14 13:34:47 -050010834 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 return NULL;
10836
Victor Stinnerc4b49542011-12-11 22:44:26 +010010837 if (PyUnicode_GET_LENGTH(self) >= width)
10838 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
Victor Stinnerc4b49542011-12-11 22:44:26 +010010840 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841 left = marg / 2 + (marg & width & 1);
10842
Victor Stinner9310abb2011-10-05 00:59:23 +020010843 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844}
10845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846/* This function assumes that str1 and str2 are readied by the caller. */
10847
Marc-André Lemburge5034372000-08-08 08:04:29 +000010848static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010849unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010851#define COMPARE(TYPE1, TYPE2) \
10852 do { \
10853 TYPE1* p1 = (TYPE1 *)data1; \
10854 TYPE2* p2 = (TYPE2 *)data2; \
10855 TYPE1* end = p1 + len; \
10856 Py_UCS4 c1, c2; \
10857 for (; p1 != end; p1++, p2++) { \
10858 c1 = *p1; \
10859 c2 = *p2; \
10860 if (c1 != c2) \
10861 return (c1 < c2) ? -1 : 1; \
10862 } \
10863 } \
10864 while (0)
10865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 int kind1, kind2;
10867 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010868 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 kind1 = PyUnicode_KIND(str1);
10871 kind2 = PyUnicode_KIND(str2);
10872 data1 = PyUnicode_DATA(str1);
10873 data2 = PyUnicode_DATA(str2);
10874 len1 = PyUnicode_GET_LENGTH(str1);
10875 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010876 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010877
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 switch(kind1) {
10879 case PyUnicode_1BYTE_KIND:
10880 {
10881 switch(kind2) {
10882 case PyUnicode_1BYTE_KIND:
10883 {
10884 int cmp = memcmp(data1, data2, len);
10885 /* normalize result of memcmp() into the range [-1; 1] */
10886 if (cmp < 0)
10887 return -1;
10888 if (cmp > 0)
10889 return 1;
10890 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010891 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 case PyUnicode_2BYTE_KIND:
10893 COMPARE(Py_UCS1, Py_UCS2);
10894 break;
10895 case PyUnicode_4BYTE_KIND:
10896 COMPARE(Py_UCS1, Py_UCS4);
10897 break;
10898 default:
10899 assert(0);
10900 }
10901 break;
10902 }
10903 case PyUnicode_2BYTE_KIND:
10904 {
10905 switch(kind2) {
10906 case PyUnicode_1BYTE_KIND:
10907 COMPARE(Py_UCS2, Py_UCS1);
10908 break;
10909 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010910 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010911 COMPARE(Py_UCS2, Py_UCS2);
10912 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010913 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 case PyUnicode_4BYTE_KIND:
10915 COMPARE(Py_UCS2, Py_UCS4);
10916 break;
10917 default:
10918 assert(0);
10919 }
10920 break;
10921 }
10922 case PyUnicode_4BYTE_KIND:
10923 {
10924 switch(kind2) {
10925 case PyUnicode_1BYTE_KIND:
10926 COMPARE(Py_UCS4, Py_UCS1);
10927 break;
10928 case PyUnicode_2BYTE_KIND:
10929 COMPARE(Py_UCS4, Py_UCS2);
10930 break;
10931 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010932 {
10933#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10934 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10935 /* normalize result of wmemcmp() into the range [-1; 1] */
10936 if (cmp < 0)
10937 return -1;
10938 if (cmp > 0)
10939 return 1;
10940#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010941 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010942#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010943 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010944 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010945 default:
10946 assert(0);
10947 }
10948 break;
10949 }
10950 default:
10951 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010952 }
10953
Victor Stinner770e19e2012-10-04 22:59:45 +020010954 if (len1 == len2)
10955 return 0;
10956 if (len1 < len2)
10957 return -1;
10958 else
10959 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010960
10961#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010962}
10963
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010964Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010965unicode_compare_eq(PyObject *str1, PyObject *str2)
10966{
10967 int kind;
10968 void *data1, *data2;
10969 Py_ssize_t len;
10970 int cmp;
10971
Victor Stinnere5567ad2012-10-23 02:48:49 +020010972 len = PyUnicode_GET_LENGTH(str1);
10973 if (PyUnicode_GET_LENGTH(str2) != len)
10974 return 0;
10975 kind = PyUnicode_KIND(str1);
10976 if (PyUnicode_KIND(str2) != kind)
10977 return 0;
10978 data1 = PyUnicode_DATA(str1);
10979 data2 = PyUnicode_DATA(str2);
10980
10981 cmp = memcmp(data1, data2, len * kind);
10982 return (cmp == 0);
10983}
10984
10985
Alexander Belopolsky40018472011-02-26 01:02:56 +000010986int
10987PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10990 if (PyUnicode_READY(left) == -1 ||
10991 PyUnicode_READY(right) == -1)
10992 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010993
10994 /* a string is equal to itself */
10995 if (left == right)
10996 return 0;
10997
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010998 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011000 PyErr_Format(PyExc_TypeError,
11001 "Can't compare %.100s and %.100s",
11002 left->ob_type->tp_name,
11003 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 return -1;
11005}
11006
Martin v. Löwis5b222132007-06-10 09:51:05 +000011007int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010011008_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
11009{
11010 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
11011 if (right_str == NULL)
11012 return -1;
11013 return PyUnicode_Compare(left, right_str);
11014}
11015
11016int
Martin v. Löwis5b222132007-06-10 09:51:05 +000011017PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_ssize_t i;
11020 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 Py_UCS4 chr;
11022
Victor Stinner910337b2011-10-03 03:20:16 +020011023 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 if (PyUnicode_READY(uni) == -1)
11025 return -1;
11026 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011027 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011028 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011029 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011030 size_t len, len2 = strlen(str);
11031 int cmp;
11032
11033 len = Py_MIN(len1, len2);
11034 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011035 if (cmp != 0) {
11036 if (cmp < 0)
11037 return -1;
11038 else
11039 return 1;
11040 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011041 if (len1 > len2)
11042 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011043 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011044 return -1; /* str is longer */
11045 return 0;
11046 }
11047 else {
11048 void *data = PyUnicode_DATA(uni);
11049 /* Compare Unicode string and source character set string */
11050 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011051 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011052 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11053 /* This check keeps Python strings that end in '\0' from comparing equal
11054 to C strings identical up to that point. */
11055 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11056 return 1; /* uni is longer */
11057 if (str[i])
11058 return -1; /* str is longer */
11059 return 0;
11060 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011061}
11062
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011063
Benjamin Peterson29060642009-01-31 22:14:21 +000011064#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011065 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011066
Alexander Belopolsky40018472011-02-26 01:02:56 +000011067PyObject *
11068PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011069{
11070 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011071 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011072
Victor Stinnere5567ad2012-10-23 02:48:49 +020011073 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11074 Py_RETURN_NOTIMPLEMENTED;
11075
11076 if (PyUnicode_READY(left) == -1 ||
11077 PyUnicode_READY(right) == -1)
11078 return NULL;
11079
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011080 if (left == right) {
11081 switch (op) {
11082 case Py_EQ:
11083 case Py_LE:
11084 case Py_GE:
11085 /* a string is equal to itself */
11086 v = Py_True;
11087 break;
11088 case Py_NE:
11089 case Py_LT:
11090 case Py_GT:
11091 v = Py_False;
11092 break;
11093 default:
11094 PyErr_BadArgument();
11095 return NULL;
11096 }
11097 }
11098 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011099 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011100 result ^= (op == Py_NE);
11101 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011102 }
11103 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011104 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011105
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011106 /* Convert the return value to a Boolean */
11107 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011108 case Py_LE:
11109 v = TEST_COND(result <= 0);
11110 break;
11111 case Py_GE:
11112 v = TEST_COND(result >= 0);
11113 break;
11114 case Py_LT:
11115 v = TEST_COND(result == -1);
11116 break;
11117 case Py_GT:
11118 v = TEST_COND(result == 1);
11119 break;
11120 default:
11121 PyErr_BadArgument();
11122 return NULL;
11123 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011124 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011125 Py_INCREF(v);
11126 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011127}
11128
Alexander Belopolsky40018472011-02-26 01:02:56 +000011129int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011130_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11131{
11132 return unicode_eq(aa, bb);
11133}
11134
11135int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011137{
Victor Stinner77282cb2013-04-14 19:22:47 +020011138 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 void *buf1, *buf2;
11140 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011141 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011142
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011143 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011145 "'in <string>' requires string as left operand, not %.100s",
11146 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011148 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011150 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011151 if (ensure_unicode(str) < 0)
11152 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011155 kind2 = PyUnicode_KIND(substr);
11156 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011157 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 len2 = PyUnicode_GET_LENGTH(substr);
11160 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011161 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011162 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011163 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011164 if (len2 == 1) {
11165 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11166 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011167 return result;
11168 }
11169 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011170 buf2 = _PyUnicode_AsKind(substr, kind1);
11171 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011172 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174
Victor Stinner77282cb2013-04-14 19:22:47 +020011175 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 case PyUnicode_1BYTE_KIND:
11177 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11178 break;
11179 case PyUnicode_2BYTE_KIND:
11180 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11181 break;
11182 case PyUnicode_4BYTE_KIND:
11183 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11184 break;
11185 default:
11186 result = -1;
11187 assert(0);
11188 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011189
Victor Stinner77282cb2013-04-14 19:22:47 +020011190 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 PyMem_Free(buf2);
11192
Guido van Rossum403d68b2000-03-13 15:55:09 +000011193 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011194}
11195
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196/* Concat to string or Unicode object giving a new Unicode object. */
11197
Alexander Belopolsky40018472011-02-26 01:02:56 +000011198PyObject *
11199PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011201 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011202 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011203 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011205 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 if (left == unicode_empty)
11210 return PyUnicode_FromObject(right);
11211 if (right == unicode_empty)
11212 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011214 left_len = PyUnicode_GET_LENGTH(left);
11215 right_len = PyUnicode_GET_LENGTH(right);
11216 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011217 PyErr_SetString(PyExc_OverflowError,
11218 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011219 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011220 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011222
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11224 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011225 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011228 result = PyUnicode_New(new_len, maxchar);
11229 if (result == NULL)
11230 return NULL;
11231 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11232 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11233 assert(_PyUnicode_CheckConsistency(result, 1));
11234 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235}
11236
Walter Dörwald1ab83302007-05-18 17:15:44 +000011237void
Victor Stinner23e56682011-10-03 03:54:37 +020011238PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011239{
Victor Stinner23e56682011-10-03 03:54:37 +020011240 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011241 Py_UCS4 maxchar, maxchar2;
11242 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011243
11244 if (p_left == NULL) {
11245 if (!PyErr_Occurred())
11246 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011247 return;
11248 }
Victor Stinner23e56682011-10-03 03:54:37 +020011249 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011250 if (right == NULL || left == NULL
11251 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011252 if (!PyErr_Occurred())
11253 PyErr_BadInternalCall();
11254 goto error;
11255 }
11256
Benjamin Petersonbac79492012-01-14 13:34:47 -050011257 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011258 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011259 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011260 goto error;
11261
Victor Stinner488fa492011-12-12 00:01:39 +010011262 /* Shortcuts */
11263 if (left == unicode_empty) {
11264 Py_DECREF(left);
11265 Py_INCREF(right);
11266 *p_left = right;
11267 return;
11268 }
11269 if (right == unicode_empty)
11270 return;
11271
11272 left_len = PyUnicode_GET_LENGTH(left);
11273 right_len = PyUnicode_GET_LENGTH(right);
11274 if (left_len > PY_SSIZE_T_MAX - right_len) {
11275 PyErr_SetString(PyExc_OverflowError,
11276 "strings are too large to concat");
11277 goto error;
11278 }
11279 new_len = left_len + right_len;
11280
11281 if (unicode_modifiable(left)
11282 && PyUnicode_CheckExact(right)
11283 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011284 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11285 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011286 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011287 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011288 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11289 {
11290 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011291 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011292 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011293
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011294 /* copy 'right' into the newly allocated area of 'left' */
11295 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011296 }
Victor Stinner488fa492011-12-12 00:01:39 +010011297 else {
11298 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11299 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011300 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011301
Victor Stinner488fa492011-12-12 00:01:39 +010011302 /* Concat the two Unicode strings */
11303 res = PyUnicode_New(new_len, maxchar);
11304 if (res == NULL)
11305 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011306 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11307 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011308 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011309 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011310 }
11311 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011312 return;
11313
11314error:
Victor Stinner488fa492011-12-12 00:01:39 +010011315 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011316}
11317
11318void
11319PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11320{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011321 PyUnicode_Append(pleft, right);
11322 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011323}
11324
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011325/*
11326Wraps stringlib_parse_args_finds() and additionally ensures that the
11327first argument is a unicode object.
11328*/
11329
11330Py_LOCAL_INLINE(int)
11331parse_args_finds_unicode(const char * function_name, PyObject *args,
11332 PyObject **substring,
11333 Py_ssize_t *start, Py_ssize_t *end)
11334{
11335 if(stringlib_parse_args_finds(function_name, args, substring,
11336 start, end)) {
11337 if (ensure_unicode(*substring) < 0)
11338 return 0;
11339 return 1;
11340 }
11341 return 0;
11342}
11343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011344PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011347Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011348string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
11351static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011352unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011354 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011355 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011356 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011358 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 void *buf1, *buf2;
11360 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011362 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 kind1 = PyUnicode_KIND(self);
11366 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011367 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011368 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 len1 = PyUnicode_GET_LENGTH(self);
11371 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011374 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011376 buf1 = PyUnicode_DATA(self);
11377 buf2 = PyUnicode_DATA(substring);
11378 if (kind2 != kind1) {
11379 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011380 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011381 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011382 }
11383 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 case PyUnicode_1BYTE_KIND:
11385 iresult = ucs1lib_count(
11386 ((Py_UCS1*)buf1) + start, end - start,
11387 buf2, len2, PY_SSIZE_T_MAX
11388 );
11389 break;
11390 case PyUnicode_2BYTE_KIND:
11391 iresult = ucs2lib_count(
11392 ((Py_UCS2*)buf1) + start, end - start,
11393 buf2, len2, PY_SSIZE_T_MAX
11394 );
11395 break;
11396 case PyUnicode_4BYTE_KIND:
11397 iresult = ucs4lib_count(
11398 ((Py_UCS4*)buf1) + start, end - start,
11399 buf2, len2, PY_SSIZE_T_MAX
11400 );
11401 break;
11402 default:
11403 assert(0); iresult = 0;
11404 }
11405
11406 result = PyLong_FromSsize_t(iresult);
11407
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011408 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411 return result;
11412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011415 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011417Encode S using the codec registered for encoding. Default encoding\n\
11418is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011420a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11421'xmlcharrefreplace' as well as any other name registered with\n\
11422codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
11424static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011425unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011427 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428 char *encoding = NULL;
11429 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011430
Benjamin Peterson308d6372009-09-18 21:42:35 +000011431 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11432 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011434 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011438 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
11440Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011444unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011446 Py_ssize_t i, j, line_pos, src_len, incr;
11447 Py_UCS4 ch;
11448 PyObject *u;
11449 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011450 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011452 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011453 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
Ezio Melotti745d54d2013-11-16 19:10:57 +020011455 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11456 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458
Antoine Pitrou22425222011-10-04 19:10:51 +020011459 if (PyUnicode_READY(self) == -1)
11460 return NULL;
11461
Thomas Wouters7e474022000-07-16 12:04:32 +000011462 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 src_len = PyUnicode_GET_LENGTH(self);
11464 i = j = line_pos = 0;
11465 kind = PyUnicode_KIND(self);
11466 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011467 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011468 for (; i < src_len; i++) {
11469 ch = PyUnicode_READ(kind, src_data, i);
11470 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011471 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011473 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 goto overflow;
11476 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011478 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011482 goto overflow;
11483 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011485 if (ch == '\n' || ch == '\r')
11486 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011489 if (!found)
11490 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011491
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 if (!u)
11495 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011496 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
Antoine Pitroue71d5742011-10-04 15:55:09 +020011500 for (; i < src_len; i++) {
11501 ch = PyUnicode_READ(kind, src_data, i);
11502 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 incr = tabsize - (line_pos % tabsize);
11505 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011506 FILL(kind, dest_data, ' ', j, incr);
11507 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011509 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011511 line_pos++;
11512 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011513 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011514 if (ch == '\n' || ch == '\r')
11515 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011517 }
11518 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011519 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011520
Antoine Pitroue71d5742011-10-04 15:55:09 +020011521 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011522 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524}
11525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011526PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528\n\
11529Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011530such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531arguments start and end are interpreted as in slice notation.\n\
11532\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011538 /* initialize variables to prevent gcc warning */
11539 PyObject *substring = NULL;
11540 Py_ssize_t start = 0;
11541 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011542 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011544 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011547 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011550 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 if (result == -2)
11553 return NULL;
11554
Christian Heimes217cfd12007-12-02 14:31:20 +000011555 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
11558static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011559unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011561 void *data;
11562 enum PyUnicode_Kind kind;
11563 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011564
11565 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11566 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011568 }
11569 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11570 PyErr_SetString(PyExc_IndexError, "string index out of range");
11571 return NULL;
11572 }
11573 kind = PyUnicode_KIND(self);
11574 data = PyUnicode_DATA(self);
11575 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011576 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577}
11578
Guido van Rossumc2504932007-09-18 19:42:40 +000011579/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011580 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011581static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011582unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583{
Guido van Rossumc2504932007-09-18 19:42:40 +000011584 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011585 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011586
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011587#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011588 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011589#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 if (_PyUnicode_HASH(self) != -1)
11591 return _PyUnicode_HASH(self);
11592 if (PyUnicode_READY(self) == -1)
11593 return -1;
11594 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011595 /*
11596 We make the hash of the empty string be 0, rather than using
11597 (prefix ^ suffix), since this slightly obfuscates the hash secret
11598 */
11599 if (len == 0) {
11600 _PyUnicode_HASH(self) = 0;
11601 return 0;
11602 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011603 x = _Py_HashBytes(PyUnicode_DATA(self),
11604 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011606 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607}
11608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
11614static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011617 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011618 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011619 PyObject *substring = NULL;
11620 Py_ssize_t start = 0;
11621 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011623 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011626 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011629 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (result == -2)
11632 return NULL;
11633
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 if (result < 0) {
11635 PyErr_SetString(PyExc_ValueError, "substring not found");
11636 return NULL;
11637 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638
Christian Heimes217cfd12007-12-02 14:31:20 +000011639 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640}
11641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011642PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011643 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011645Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011646at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
11648static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011649unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 Py_ssize_t i, length;
11652 int kind;
11653 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 int cased;
11655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (PyUnicode_READY(self) == -1)
11657 return NULL;
11658 length = PyUnicode_GET_LENGTH(self);
11659 kind = PyUnicode_KIND(self);
11660 data = PyUnicode_DATA(self);
11661
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (length == 1)
11664 return PyBool_FromLong(
11665 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011667 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011670
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 for (i = 0; i < length; i++) {
11673 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011674
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11676 return PyBool_FromLong(0);
11677 else if (!cased && Py_UNICODE_ISLOWER(ch))
11678 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011680 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681}
11682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011683PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011686Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011687at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
11689static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011690unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 Py_ssize_t i, length;
11693 int kind;
11694 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 int cased;
11696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699 length = PyUnicode_GET_LENGTH(self);
11700 kind = PyUnicode_KIND(self);
11701 data = PyUnicode_DATA(self);
11702
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (length == 1)
11705 return PyBool_FromLong(
11706 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011708 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 for (i = 0; i < length; i++) {
11714 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011715
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11717 return PyBool_FromLong(0);
11718 else if (!cased && Py_UNICODE_ISUPPER(ch))
11719 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011721 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722}
11723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011724PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011727Return True if S is a titlecased string and there is at least one\n\
11728character in S, i.e. upper- and titlecase characters may only\n\
11729follow uncased characters and lowercase characters only cased ones.\n\
11730Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
11732static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011733unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 Py_ssize_t i, length;
11736 int kind;
11737 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 int cased, previous_is_cased;
11739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (PyUnicode_READY(self) == -1)
11741 return NULL;
11742 length = PyUnicode_GET_LENGTH(self);
11743 kind = PyUnicode_KIND(self);
11744 data = PyUnicode_DATA(self);
11745
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (length == 1) {
11748 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11749 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11750 (Py_UNICODE_ISUPPER(ch) != 0));
11751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011753 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011756
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 cased = 0;
11758 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 for (i = 0; i < length; i++) {
11760 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011761
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11763 if (previous_is_cased)
11764 return PyBool_FromLong(0);
11765 previous_is_cased = 1;
11766 cased = 1;
11767 }
11768 else if (Py_UNICODE_ISLOWER(ch)) {
11769 if (!previous_is_cased)
11770 return PyBool_FromLong(0);
11771 previous_is_cased = 1;
11772 cased = 1;
11773 }
11774 else
11775 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778}
11779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011783Return True if all characters in S are whitespace\n\
11784and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
11786static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011787unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t i, length;
11790 int kind;
11791 void *data;
11792
11793 if (PyUnicode_READY(self) == -1)
11794 return NULL;
11795 length = PyUnicode_GET_LENGTH(self);
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 1)
11801 return PyBool_FromLong(
11802 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011804 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 for (i = 0; i < length; i++) {
11809 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011810 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011813 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814}
11815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011816PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011818\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011819Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011820and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011821
11822static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011823unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 Py_ssize_t i, length;
11826 int kind;
11827 void *data;
11828
11829 if (PyUnicode_READY(self) == -1)
11830 return NULL;
11831 length = PyUnicode_GET_LENGTH(self);
11832 kind = PyUnicode_KIND(self);
11833 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011835 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (length == 1)
11837 return PyBool_FromLong(
11838 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011839
11840 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 for (i = 0; i < length; i++) {
11845 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011847 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011848 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849}
11850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011851PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011853\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011854Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011855and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011856
11857static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011858unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 int kind;
11861 void *data;
11862 Py_ssize_t len, i;
11863
11864 if (PyUnicode_READY(self) == -1)
11865 return NULL;
11866
11867 kind = PyUnicode_KIND(self);
11868 data = PyUnicode_DATA(self);
11869 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011871 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (len == 1) {
11873 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11874 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11875 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011876
11877 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 for (i = 0; i < len; i++) {
11882 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011883 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011885 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011886 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011887}
11888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011889PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011892Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011893False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894
11895static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011896unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 Py_ssize_t i, length;
11899 int kind;
11900 void *data;
11901
11902 if (PyUnicode_READY(self) == -1)
11903 return NULL;
11904 length = PyUnicode_GET_LENGTH(self);
11905 kind = PyUnicode_KIND(self);
11906 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (length == 1)
11910 return PyBool_FromLong(
11911 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011913 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 for (i = 0; i < length; i++) {
11918 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011921 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922}
11923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011924PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011927Return True if all characters in S are digits\n\
11928and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
11930static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011931unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 Py_ssize_t i, length;
11934 int kind;
11935 void *data;
11936
11937 if (PyUnicode_READY(self) == -1)
11938 return NULL;
11939 length = PyUnicode_GET_LENGTH(self);
11940 kind = PyUnicode_KIND(self);
11941 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 if (length == 1) {
11945 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11946 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011949 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 for (i = 0; i < length; i++) {
11954 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958}
11959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011960PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011963Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011964False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
11966static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011967unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 Py_ssize_t i, length;
11970 int kind;
11971 void *data;
11972
11973 if (PyUnicode_READY(self) == -1)
11974 return NULL;
11975 length = PyUnicode_GET_LENGTH(self);
11976 kind = PyUnicode_KIND(self);
11977 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if (length == 1)
11981 return PyBool_FromLong(
11982 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011984 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 for (i = 0; i < length; i++) {
11989 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011992 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993}
11994
Martin v. Löwis47383402007-08-15 07:32:56 +000011995int
11996PyUnicode_IsIdentifier(PyObject *self)
11997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 int kind;
11999 void *data;
12000 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012001 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (PyUnicode_READY(self) == -1) {
12004 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 }
12007
12008 /* Special case for empty strings */
12009 if (PyUnicode_GET_LENGTH(self) == 0)
12010 return 0;
12011 kind = PyUnicode_KIND(self);
12012 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012013
12014 /* PEP 3131 says that the first character must be in
12015 XID_Start and subsequent characters in XID_Continue,
12016 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012017 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012018 letters, digits, underscore). However, given the current
12019 definition of XID_Start and XID_Continue, it is sufficient
12020 to check just for these, except that _ must be allowed
12021 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012023 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012024 return 0;
12025
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012026 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012029 return 1;
12030}
12031
12032PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012034\n\
12035Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012036to the language definition.\n\
12037\n\
12038Use keyword.iskeyword() to test for reserved identifiers\n\
12039such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012040
12041static PyObject*
12042unicode_isidentifier(PyObject *self)
12043{
12044 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12045}
12046
Georg Brandl559e5d72008-06-11 18:37:52 +000012047PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012049\n\
12050Return True if all characters in S are considered\n\
12051printable in repr() or S is empty, False otherwise.");
12052
12053static PyObject*
12054unicode_isprintable(PyObject *self)
12055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 Py_ssize_t i, length;
12057 int kind;
12058 void *data;
12059
12060 if (PyUnicode_READY(self) == -1)
12061 return NULL;
12062 length = PyUnicode_GET_LENGTH(self);
12063 kind = PyUnicode_KIND(self);
12064 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012065
12066 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (length == 1)
12068 return PyBool_FromLong(
12069 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 for (i = 0; i < length; i++) {
12072 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012073 Py_RETURN_FALSE;
12074 }
12075 }
12076 Py_RETURN_TRUE;
12077}
12078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012079PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012080 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081\n\
12082Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012083iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
12085static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012086unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012088 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089}
12090
Martin v. Löwis18e16552006-02-15 17:27:45 +000012091static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012092unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 if (PyUnicode_READY(self) == -1)
12095 return -1;
12096 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097}
12098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012099PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012102Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012103done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
12105static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012106unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012108 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 Py_UCS4 fillchar = ' ';
12110
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012111 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112 return NULL;
12113
Benjamin Petersonbac79492012-01-14 13:34:47 -050012114 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
Victor Stinnerc4b49542011-12-11 22:44:26 +010012117 if (PyUnicode_GET_LENGTH(self) >= width)
12118 return unicode_result_unchanged(self);
12119
12120 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121}
12122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012123PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012124 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
12128static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012129unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012131 if (PyUnicode_READY(self) == -1)
12132 return NULL;
12133 if (PyUnicode_IS_ASCII(self))
12134 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012135 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of
   the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
12146
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147/* externally visible for str.strip(unicode) */
12148PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012149_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 void *data;
12152 int kind;
12153 Py_ssize_t i, j, len;
12154 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012155 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12158 return NULL;
12159
12160 kind = PyUnicode_KIND(self);
12161 data = PyUnicode_DATA(self);
12162 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012163 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12165 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012166 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012167
Benjamin Peterson14339b62009-01-31 16:36:08 +000012168 i = 0;
12169 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012170 while (i < len) {
12171 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12172 if (!BLOOM(sepmask, ch))
12173 break;
12174 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12175 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 i++;
12177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012178 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012179
Benjamin Peterson14339b62009-01-31 16:36:08 +000012180 j = len;
12181 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012182 j--;
12183 while (j >= i) {
12184 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12185 if (!BLOOM(sepmask, ch))
12186 break;
12187 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12188 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012189 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012190 }
12191
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012193 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012194
Victor Stinner7931d9a2011-11-04 00:22:48 +010012195 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196}
12197
12198PyObject*
12199PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12200{
12201 unsigned char *data;
12202 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012203 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204
Victor Stinnerde636f32011-10-01 03:55:54 +020012205 if (PyUnicode_READY(self) == -1)
12206 return NULL;
12207
Victor Stinner684d5fd2012-05-03 02:32:34 +020012208 length = PyUnicode_GET_LENGTH(self);
12209 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012210
Victor Stinner684d5fd2012-05-03 02:32:34 +020012211 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012212 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213
Victor Stinnerde636f32011-10-01 03:55:54 +020012214 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012215 PyErr_SetString(PyExc_IndexError, "string index out of range");
12216 return NULL;
12217 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012218 if (start >= length || end < start)
12219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012220
Victor Stinner684d5fd2012-05-03 02:32:34 +020012221 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012222 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012223 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012224 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012225 }
12226 else {
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_1BYTE_DATA(self);
12229 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012230 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012231 length);
12232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234
12235static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012236do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 Py_ssize_t len, i, j;
12239
12240 if (PyUnicode_READY(self) == -1)
12241 return NULL;
12242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012244
Victor Stinnercc7af722013-04-09 22:39:24 +020012245 if (PyUnicode_IS_ASCII(self)) {
12246 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12247
12248 i = 0;
12249 if (striptype != RIGHTSTRIP) {
12250 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012251 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012252 if (!_Py_ascii_whitespace[ch])
12253 break;
12254 i++;
12255 }
12256 }
12257
12258 j = len;
12259 if (striptype != LEFTSTRIP) {
12260 j--;
12261 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012262 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012263 if (!_Py_ascii_whitespace[ch])
12264 break;
12265 j--;
12266 }
12267 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012268 }
12269 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012270 else {
12271 int kind = PyUnicode_KIND(self);
12272 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012273
Victor Stinnercc7af722013-04-09 22:39:24 +020012274 i = 0;
12275 if (striptype != RIGHTSTRIP) {
12276 while (i < len) {
12277 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12278 if (!Py_UNICODE_ISSPACE(ch))
12279 break;
12280 i++;
12281 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012282 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012283
12284 j = len;
12285 if (striptype != LEFTSTRIP) {
12286 j--;
12287 while (j >= i) {
12288 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12289 if (!Py_UNICODE_ISSPACE(ch))
12290 break;
12291 j--;
12292 }
12293 j++;
12294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012295 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296
Victor Stinner7931d9a2011-11-04 00:22:48 +010012297 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298}
12299
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012300
12301static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012302do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012303{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012304 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012305
Serhiy Storchakac6792272013-10-19 21:03:34 +030012306 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012308
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 if (sep != NULL && sep != Py_None) {
12310 if (PyUnicode_Check(sep))
12311 return _PyUnicode_XStrip(self, striptype, sep);
12312 else {
12313 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 "%s arg must be None or str",
12315 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012316 return NULL;
12317 }
12318 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012321}
12322
12323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012324PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326\n\
12327Return a copy of the string S with leading and trailing\n\
12328whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012329If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012330
12331static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012332unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012333{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012334 if (PyTuple_GET_SIZE(args) == 0)
12335 return do_strip(self, BOTHSTRIP); /* Common case */
12336 else
12337 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012338}
12339
12340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012341PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012343\n\
12344Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012345If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012346
12347static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012348unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012349{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012350 if (PyTuple_GET_SIZE(args) == 0)
12351 return do_strip(self, LEFTSTRIP); /* Common case */
12352 else
12353 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012354}
12355
12356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012357PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012359\n\
12360Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012361If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012362
12363static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012364unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012365{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012366 if (PyTuple_GET_SIZE(args) == 0)
12367 return do_strip(self, RIGHTSTRIP); /* Common case */
12368 else
12369 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012370}
12371
12372
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012374unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012376 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
Serhiy Storchaka05997252013-01-26 12:14:02 +020012379 if (len < 1)
12380 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
Victor Stinnerc4b49542011-12-11 22:44:26 +010012382 /* no repeat, return original string */
12383 if (len == 1)
12384 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012385
Benjamin Petersonbac79492012-01-14 13:34:47 -050012386 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 return NULL;
12388
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012389 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012390 PyErr_SetString(PyExc_OverflowError,
12391 "repeated string is too long");
12392 return NULL;
12393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012395
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012396 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397 if (!u)
12398 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012399 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 if (PyUnicode_GET_LENGTH(str) == 1) {
12402 const int kind = PyUnicode_KIND(str);
12403 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012404 if (kind == PyUnicode_1BYTE_KIND) {
12405 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012406 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012407 }
12408 else if (kind == PyUnicode_2BYTE_KIND) {
12409 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012410 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012411 ucs2[n] = fill_char;
12412 } else {
12413 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12414 assert(kind == PyUnicode_4BYTE_KIND);
12415 for (n = 0; n < len; ++n)
12416 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 }
12419 else {
12420 /* number of characters copied this far */
12421 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012422 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 char *to = (char *) PyUnicode_DATA(u);
12424 Py_MEMCPY(to, PyUnicode_DATA(str),
12425 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 n = (done <= nchars-done) ? done : nchars-done;
12428 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431 }
12432
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012433 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012434 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435}
12436
Alexander Belopolsky40018472011-02-26 01:02:56 +000012437PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012438PyUnicode_Replace(PyObject *str,
12439 PyObject *substr,
12440 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012441 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012443 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12444 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012446 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447}
12448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012449PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012450 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451\n\
12452Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012453old replaced by new. If the optional argument count is\n\
12454given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455
12456static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 PyObject *str1;
12460 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012461 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012463 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012465 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012467 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468}
12469
Alexander Belopolsky40018472011-02-26 01:02:56 +000012470static PyObject *
12471unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012473 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 Py_ssize_t isize;
12475 Py_ssize_t osize, squote, dquote, i, o;
12476 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012477 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012481 return NULL;
12482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 isize = PyUnicode_GET_LENGTH(unicode);
12484 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 /* Compute length of output, quote characters, and
12487 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012488 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 max = 127;
12490 squote = dquote = 0;
12491 ikind = PyUnicode_KIND(unicode);
12492 for (i = 0; i < isize; i++) {
12493 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012494 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012496 case '\'': squote++; break;
12497 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012499 incr = 2;
12500 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 default:
12502 /* Fast-path ASCII */
12503 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012504 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012506 ;
12507 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012510 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012512 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012514 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012516 if (osize > PY_SSIZE_T_MAX - incr) {
12517 PyErr_SetString(PyExc_OverflowError,
12518 "string is too long to generate repr");
12519 return NULL;
12520 }
12521 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 }
12523
12524 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012525 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012527 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 if (dquote)
12529 /* Both squote and dquote present. Use squote,
12530 and escape them */
12531 osize += squote;
12532 else
12533 quote = '"';
12534 }
Victor Stinner55c08782013-04-14 18:45:39 +020012535 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536
12537 repr = PyUnicode_New(osize, max);
12538 if (repr == NULL)
12539 return NULL;
12540 okind = PyUnicode_KIND(repr);
12541 odata = PyUnicode_DATA(repr);
12542
12543 PyUnicode_WRITE(okind, odata, 0, quote);
12544 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012545 if (unchanged) {
12546 _PyUnicode_FastCopyCharacters(repr, 1,
12547 unicode, 0,
12548 isize);
12549 }
12550 else {
12551 for (i = 0, o = 1; i < isize; i++) {
12552 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553
Victor Stinner55c08782013-04-14 18:45:39 +020012554 /* Escape quotes and backslashes */
12555 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012556 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012558 continue;
12559 }
12560
12561 /* Map special whitespace to '\t', \n', '\r' */
12562 if (ch == '\t') {
12563 PyUnicode_WRITE(okind, odata, o++, '\\');
12564 PyUnicode_WRITE(okind, odata, o++, 't');
12565 }
12566 else if (ch == '\n') {
12567 PyUnicode_WRITE(okind, odata, o++, '\\');
12568 PyUnicode_WRITE(okind, odata, o++, 'n');
12569 }
12570 else if (ch == '\r') {
12571 PyUnicode_WRITE(okind, odata, o++, '\\');
12572 PyUnicode_WRITE(okind, odata, o++, 'r');
12573 }
12574
12575 /* Map non-printable US ASCII to '\xhh' */
12576 else if (ch < ' ' || ch == 0x7F) {
12577 PyUnicode_WRITE(okind, odata, o++, '\\');
12578 PyUnicode_WRITE(okind, odata, o++, 'x');
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12580 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12581 }
12582
12583 /* Copy ASCII characters as-is */
12584 else if (ch < 0x7F) {
12585 PyUnicode_WRITE(okind, odata, o++, ch);
12586 }
12587
12588 /* Non-ASCII characters */
12589 else {
12590 /* Map Unicode whitespace and control characters
12591 (categories Z* and C* except ASCII space)
12592 */
12593 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12594 PyUnicode_WRITE(okind, odata, o++, '\\');
12595 /* Map 8-bit characters to '\xhh' */
12596 if (ch <= 0xff) {
12597 PyUnicode_WRITE(okind, odata, o++, 'x');
12598 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12599 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12600 }
12601 /* Map 16-bit characters to '\uxxxx' */
12602 else if (ch <= 0xffff) {
12603 PyUnicode_WRITE(okind, odata, o++, 'u');
12604 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12605 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12606 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12607 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12608 }
12609 /* Map 21-bit characters to '\U00xxxxxx' */
12610 else {
12611 PyUnicode_WRITE(okind, odata, o++, 'U');
12612 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12613 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12614 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12615 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12616 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12617 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12618 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12619 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12620 }
12621 }
12622 /* Copy characters as-is */
12623 else {
12624 PyUnicode_WRITE(okind, odata, o++, ch);
12625 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012626 }
12627 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012630 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012631 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632}
12633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012634PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636\n\
12637Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012638such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639arguments start and end are interpreted as in slice notation.\n\
12640\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012641Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642
12643static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012646 /* initialize variables to prevent gcc warning */
12647 PyObject *substring = NULL;
12648 Py_ssize_t start = 0;
12649 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012650 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012652 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012655 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012658 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 if (result == -2)
12661 return NULL;
12662
Christian Heimes217cfd12007-12-02 14:31:20 +000012663 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664}
12665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012666PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012669Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670
12671static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012674 /* initialize variables to prevent gcc warning */
12675 PyObject *substring = NULL;
12676 Py_ssize_t start = 0;
12677 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012678 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012680 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012683 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012686 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (result == -2)
12689 return NULL;
12690
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691 if (result < 0) {
12692 PyErr_SetString(PyExc_ValueError, "substring not found");
12693 return NULL;
12694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695
Christian Heimes217cfd12007-12-02 14:31:20 +000012696 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697}
12698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012699PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012702Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012703done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
12705static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012706unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012708 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 Py_UCS4 fillchar = ' ';
12710
Victor Stinnere9a29352011-10-01 02:14:59 +020012711 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012713
Benjamin Petersonbac79492012-01-14 13:34:47 -050012714 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715 return NULL;
12716
Victor Stinnerc4b49542011-12-11 22:44:26 +010012717 if (PyUnicode_GET_LENGTH(self) >= width)
12718 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719
Victor Stinnerc4b49542011-12-11 22:44:26 +010012720 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721}
12722
Alexander Belopolsky40018472011-02-26 01:02:56 +000012723PyObject *
12724PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012726 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012727 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012729 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730}
12731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012732PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012733 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734\n\
12735Return a list of the words in S, using sep as the\n\
12736delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012737splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012738whitespace string is a separator and empty strings are\n\
12739removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
12741static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012742unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012744 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012746 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012748 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12749 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750 return NULL;
12751
12752 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012754
12755 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012756 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012757
12758 PyErr_Format(PyExc_TypeError,
12759 "must be str or None, not %.100s",
12760 Py_TYPE(substring)->tp_name);
12761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762}
12763
Thomas Wouters477c8d52006-05-27 19:21:47 +000012764PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012765PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012766{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012768 int kind1, kind2;
12769 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012772 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012774
Victor Stinner14f8f022011-10-05 20:58:25 +020012775 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 len1 = PyUnicode_GET_LENGTH(str_obj);
12778 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012779 if (kind1 < kind2 || len1 < len2) {
12780 _Py_INCREF_UNICODE_EMPTY();
12781 if (!unicode_empty)
12782 out = NULL;
12783 else {
12784 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12785 Py_DECREF(unicode_empty);
12786 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012787 return out;
12788 }
12789 buf1 = PyUnicode_DATA(str_obj);
12790 buf2 = PyUnicode_DATA(sep_obj);
12791 if (kind2 != kind1) {
12792 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12793 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012794 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012797 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012799 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12800 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12801 else
12802 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 break;
12804 case PyUnicode_2BYTE_KIND:
12805 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12806 break;
12807 case PyUnicode_4BYTE_KIND:
12808 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12809 break;
12810 default:
12811 assert(0);
12812 out = 0;
12813 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012815 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817
12818 return out;
12819}
12820
12821
12822PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012823PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012826 int kind1, kind2;
12827 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012830 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012832
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012833 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 len1 = PyUnicode_GET_LENGTH(str_obj);
12836 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012837 if (kind1 < kind2 || len1 < len2) {
12838 _Py_INCREF_UNICODE_EMPTY();
12839 if (!unicode_empty)
12840 out = NULL;
12841 else {
12842 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12843 Py_DECREF(unicode_empty);
12844 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012845 return out;
12846 }
12847 buf1 = PyUnicode_DATA(str_obj);
12848 buf2 = PyUnicode_DATA(sep_obj);
12849 if (kind2 != kind1) {
12850 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12851 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012852 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012855 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012857 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12858 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12859 else
12860 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 break;
12862 case PyUnicode_2BYTE_KIND:
12863 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12864 break;
12865 case PyUnicode_4BYTE_KIND:
12866 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12867 break;
12868 default:
12869 assert(0);
12870 out = 0;
12871 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012872
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012873 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875
12876 return out;
12877}
12878
12879PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012881\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012882Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012883the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012884found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012885
12886static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012887unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012888{
Victor Stinner9310abb2011-10-05 00:59:23 +020012889 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012890}
12891
12892PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012893 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012894\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012895Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012896the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012897separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012898
12899static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012900unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901{
Victor Stinner9310abb2011-10-05 00:59:23 +020012902 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903}
12904
Alexander Belopolsky40018472011-02-26 01:02:56 +000012905PyObject *
12906PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012907{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012908 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012909 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012910
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012911 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012912}
12913
12914PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012915 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012916\n\
12917Return a list of the words in S, using sep as the\n\
12918delimiter string, starting at the end of the string and\n\
12919working to the front. If maxsplit is given, at most maxsplit\n\
12920splits are done. If sep is not specified, any whitespace string\n\
12921is a separator.");
12922
12923static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012924unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012925{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012926 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012927 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012928 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012929
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012930 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12931 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012932 return NULL;
12933
12934 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012935 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012936
12937 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012938 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012939
12940 PyErr_Format(PyExc_TypeError,
12941 "must be str or None, not %.100s",
12942 Py_TYPE(substring)->tp_name);
12943 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012944}
12945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012946PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012947 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948\n\
12949Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012950Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012951is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952
12953static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012954unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012956 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012957 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012959 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12960 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961 return NULL;
12962
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012963 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964}
12965
12966static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012967PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012969 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970}
12971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012972PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974\n\
12975Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012976and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977
12978static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012979unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012981 if (PyUnicode_READY(self) == -1)
12982 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012983 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984}
12985
Larry Hastings61272b72014-01-07 12:41:53 -080012986/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012987
Larry Hastings31826802013-10-19 00:09:25 -070012988@staticmethod
12989str.maketrans as unicode_maketrans
12990
12991 x: object
12992
12993 y: unicode=NULL
12994
12995 z: unicode=NULL
12996
12997 /
12998
12999Return a translation table usable for str.translate().
13000
13001If there is only one argument, it must be a dictionary mapping Unicode
13002ordinals (integers) or characters to Unicode ordinals, strings or None.
13003Character keys will be then converted to ordinals.
13004If there are two arguments, they must be strings of equal length, and
13005in the resulting dictionary, each character in x will be mapped to the
13006character at the same position in y. If there is a third argument, it
13007must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013008[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013009
Larry Hastings31826802013-10-19 00:09:25 -070013010static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013011unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013012/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013013{
Georg Brandlceee0772007-11-27 23:48:05 +000013014 PyObject *new = NULL, *key, *value;
13015 Py_ssize_t i = 0;
13016 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013017
Georg Brandlceee0772007-11-27 23:48:05 +000013018 new = PyDict_New();
13019 if (!new)
13020 return NULL;
13021 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 int x_kind, y_kind, z_kind;
13023 void *x_data, *y_data, *z_data;
13024
Georg Brandlceee0772007-11-27 23:48:05 +000013025 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013026 if (!PyUnicode_Check(x)) {
13027 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13028 "be a string if there is a second argument");
13029 goto err;
13030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013032 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13033 "arguments must have equal length");
13034 goto err;
13035 }
13036 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 x_kind = PyUnicode_KIND(x);
13038 y_kind = PyUnicode_KIND(y);
13039 x_data = PyUnicode_DATA(x);
13040 y_data = PyUnicode_DATA(y);
13041 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13042 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013043 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013044 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013045 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013046 if (!value) {
13047 Py_DECREF(key);
13048 goto err;
13049 }
Georg Brandlceee0772007-11-27 23:48:05 +000013050 res = PyDict_SetItem(new, key, value);
13051 Py_DECREF(key);
13052 Py_DECREF(value);
13053 if (res < 0)
13054 goto err;
13055 }
13056 /* create entries for deleting chars in z */
13057 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 z_kind = PyUnicode_KIND(z);
13059 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013060 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013062 if (!key)
13063 goto err;
13064 res = PyDict_SetItem(new, key, Py_None);
13065 Py_DECREF(key);
13066 if (res < 0)
13067 goto err;
13068 }
13069 }
13070 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 int kind;
13072 void *data;
13073
Georg Brandlceee0772007-11-27 23:48:05 +000013074 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013075 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013076 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13077 "to maketrans it must be a dict");
13078 goto err;
13079 }
13080 /* copy entries into the new dict, converting string keys to int keys */
13081 while (PyDict_Next(x, &i, &key, &value)) {
13082 if (PyUnicode_Check(key)) {
13083 /* convert string keys to integer keys */
13084 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013085 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013086 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13087 "table must be of length 1");
13088 goto err;
13089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 kind = PyUnicode_KIND(key);
13091 data = PyUnicode_DATA(key);
13092 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013093 if (!newkey)
13094 goto err;
13095 res = PyDict_SetItem(new, newkey, value);
13096 Py_DECREF(newkey);
13097 if (res < 0)
13098 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013099 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013100 /* just keep integer keys */
13101 if (PyDict_SetItem(new, key, value) < 0)
13102 goto err;
13103 } else {
13104 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13105 "be strings or integers");
13106 goto err;
13107 }
13108 }
13109 }
13110 return new;
13111 err:
13112 Py_DECREF(new);
13113 return NULL;
13114}
13115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013116PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013119Return a copy of the string S in which each character has been mapped\n\
13120through the given translation table. The table must implement\n\
13121lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13122mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13123this operation raises LookupError, the character is left untouched.\n\
13124Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
13126static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130}
13131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013132PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013135Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
13137static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013138unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013140 if (PyUnicode_READY(self) == -1)
13141 return NULL;
13142 if (PyUnicode_IS_ASCII(self))
13143 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013144 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145}
13146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013147PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013150Pad a numeric string S with zeros on the left, to fill a field\n\
13151of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152
13153static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013154unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013156 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013157 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013158 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 int kind;
13160 void *data;
13161 Py_UCS4 chr;
13162
Martin v. Löwis18e16552006-02-15 17:27:45 +000013163 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164 return NULL;
13165
Benjamin Petersonbac79492012-01-14 13:34:47 -050013166 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168
Victor Stinnerc4b49542011-12-11 22:44:26 +010013169 if (PyUnicode_GET_LENGTH(self) >= width)
13170 return unicode_result_unchanged(self);
13171
13172 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173
13174 u = pad(self, fill, 0, '0');
13175
Walter Dörwald068325e2002-04-15 13:36:47 +000013176 if (u == NULL)
13177 return NULL;
13178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 kind = PyUnicode_KIND(u);
13180 data = PyUnicode_DATA(u);
13181 chr = PyUnicode_READ(kind, data, fill);
13182
13183 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 PyUnicode_WRITE(kind, data, 0, chr);
13186 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187 }
13188
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013189 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013190 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013201PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013204Return True if S starts with the specified prefix, False otherwise.\n\
13205With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206With optional end, stop comparing S at that position.\n\
13207prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208
13209static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013210unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013213 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013214 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013215 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013216 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218
Jesus Ceaac451502011-04-20 17:09:23 +020013219 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221 if (PyTuple_Check(subobj)) {
13222 Py_ssize_t i;
13223 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013224 substring = PyTuple_GET_ITEM(subobj, i);
13225 if (!PyUnicode_Check(substring)) {
13226 PyErr_Format(PyExc_TypeError,
13227 "tuple for startswith must only contain str, "
13228 "not %.100s",
13229 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013230 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013231 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013232 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013233 if (result == -1)
13234 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013235 if (result) {
13236 Py_RETURN_TRUE;
13237 }
13238 }
13239 /* nothing matched */
13240 Py_RETURN_FALSE;
13241 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013242 if (!PyUnicode_Check(subobj)) {
13243 PyErr_Format(PyExc_TypeError,
13244 "startswith first arg must be str or "
13245 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013247 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013248 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013249 if (result == -1)
13250 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013251 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252}
13253
13254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013255PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013258Return True if S ends with the specified suffix, False otherwise.\n\
13259With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013260With optional end, stop comparing S at that position.\n\
13261suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262
13263static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013264unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013267 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013268 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013269 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013270 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013271 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013272
Jesus Ceaac451502011-04-20 17:09:23 +020013273 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013275 if (PyTuple_Check(subobj)) {
13276 Py_ssize_t i;
13277 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013278 substring = PyTuple_GET_ITEM(subobj, i);
13279 if (!PyUnicode_Check(substring)) {
13280 PyErr_Format(PyExc_TypeError,
13281 "tuple for endswith must only contain str, "
13282 "not %.100s",
13283 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013285 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013286 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013287 if (result == -1)
13288 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013289 if (result) {
13290 Py_RETURN_TRUE;
13291 }
13292 }
13293 Py_RETURN_FALSE;
13294 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013295 if (!PyUnicode_Check(subobj)) {
13296 PyErr_Format(PyExc_TypeError,
13297 "endswith first arg must be str or "
13298 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013300 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013301 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013302 if (result == -1)
13303 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013304 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305}
13306
Victor Stinner202fdca2012-05-07 12:47:02 +020013307Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013308_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013309{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013310 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13311 writer->data = PyUnicode_DATA(writer->buffer);
13312
13313 if (!writer->readonly) {
13314 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013315 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013316 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013317 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013318 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13319 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13320 writer->kind = PyUnicode_WCHAR_KIND;
13321 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13322
Victor Stinner8f674cc2013-04-17 23:02:17 +020013323 /* Copy-on-write mode: set buffer size to 0 so
13324 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13325 * next write. */
13326 writer->size = 0;
13327 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013328}
13329
Victor Stinnerd3f08822012-05-29 12:57:52 +020013330void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013331_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013332{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013334
13335 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013336 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013337
13338 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13339 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13340 writer->kind = PyUnicode_WCHAR_KIND;
13341 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013342}
13343
Victor Stinnerd3f08822012-05-29 12:57:52 +020013344int
13345_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13346 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013347{
13348 Py_ssize_t newlen;
13349 PyObject *newbuffer;
13350
Victor Stinnerca9381e2015-09-22 00:58:32 +020013351 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013352 assert((maxchar > writer->maxchar && length >= 0)
13353 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013354
Victor Stinner202fdca2012-05-07 12:47:02 +020013355 if (length > PY_SSIZE_T_MAX - writer->pos) {
13356 PyErr_NoMemory();
13357 return -1;
13358 }
13359 newlen = writer->pos + length;
13360
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013361 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013362
Victor Stinnerd3f08822012-05-29 12:57:52 +020013363 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013364 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013365 if (writer->overallocate
13366 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13367 /* overallocate to limit the number of realloc() */
13368 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013370 if (newlen < writer->min_length)
13371 newlen = writer->min_length;
13372
Victor Stinnerd3f08822012-05-29 12:57:52 +020013373 writer->buffer = PyUnicode_New(newlen, maxchar);
13374 if (writer->buffer == NULL)
13375 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013376 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013377 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013378 if (writer->overallocate
13379 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13380 /* overallocate to limit the number of realloc() */
13381 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013382 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013383 if (newlen < writer->min_length)
13384 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013385
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013386 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013387 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013388 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013389 newbuffer = PyUnicode_New(newlen, maxchar);
13390 if (newbuffer == NULL)
13391 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013392 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13393 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013394 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013395 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013396 }
13397 else {
13398 newbuffer = resize_compact(writer->buffer, newlen);
13399 if (newbuffer == NULL)
13400 return -1;
13401 }
13402 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013403 }
13404 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013405 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013406 newbuffer = PyUnicode_New(writer->size, maxchar);
13407 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013408 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013409 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13410 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013411 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013412 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013413 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013414 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013415
13416#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013417}
13418
Victor Stinnerca9381e2015-09-22 00:58:32 +020013419int
13420_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13421 enum PyUnicode_Kind kind)
13422{
13423 Py_UCS4 maxchar;
13424
13425 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13426 assert(writer->kind < kind);
13427
13428 switch (kind)
13429 {
13430 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13431 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13432 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13433 default:
13434 assert(0 && "invalid kind");
13435 return -1;
13436 }
13437
13438 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13439}
13440
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013441Py_LOCAL_INLINE(int)
13442_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013443{
13444 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13445 return -1;
13446 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13447 writer->pos++;
13448 return 0;
13449}
13450
13451int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013452_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13453{
13454 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13455}
13456
13457int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013458_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13459{
13460 Py_UCS4 maxchar;
13461 Py_ssize_t len;
13462
13463 if (PyUnicode_READY(str) == -1)
13464 return -1;
13465 len = PyUnicode_GET_LENGTH(str);
13466 if (len == 0)
13467 return 0;
13468 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13469 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013470 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013471 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013472 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013473 Py_INCREF(str);
13474 writer->buffer = str;
13475 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013476 writer->pos += len;
13477 return 0;
13478 }
13479 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13480 return -1;
13481 }
13482 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13483 str, 0, len);
13484 writer->pos += len;
13485 return 0;
13486}
13487
Victor Stinnere215d962012-10-06 23:03:36 +020013488int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013489_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13490 Py_ssize_t start, Py_ssize_t end)
13491{
13492 Py_UCS4 maxchar;
13493 Py_ssize_t len;
13494
13495 if (PyUnicode_READY(str) == -1)
13496 return -1;
13497
13498 assert(0 <= start);
13499 assert(end <= PyUnicode_GET_LENGTH(str));
13500 assert(start <= end);
13501
13502 if (end == 0)
13503 return 0;
13504
13505 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13506 return _PyUnicodeWriter_WriteStr(writer, str);
13507
13508 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13509 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13510 else
13511 maxchar = writer->maxchar;
13512 len = end - start;
13513
13514 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13515 return -1;
13516
13517 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13518 str, start, len);
13519 writer->pos += len;
13520 return 0;
13521}
13522
13523int
Victor Stinner4a587072013-11-19 12:54:53 +010013524_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13525 const char *ascii, Py_ssize_t len)
13526{
13527 if (len == -1)
13528 len = strlen(ascii);
13529
13530 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13531
13532 if (writer->buffer == NULL && !writer->overallocate) {
13533 PyObject *str;
13534
13535 str = _PyUnicode_FromASCII(ascii, len);
13536 if (str == NULL)
13537 return -1;
13538
13539 writer->readonly = 1;
13540 writer->buffer = str;
13541 _PyUnicodeWriter_Update(writer);
13542 writer->pos += len;
13543 return 0;
13544 }
13545
13546 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13547 return -1;
13548
13549 switch (writer->kind)
13550 {
13551 case PyUnicode_1BYTE_KIND:
13552 {
13553 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13554 Py_UCS1 *data = writer->data;
13555
13556 Py_MEMCPY(data + writer->pos, str, len);
13557 break;
13558 }
13559 case PyUnicode_2BYTE_KIND:
13560 {
13561 _PyUnicode_CONVERT_BYTES(
13562 Py_UCS1, Py_UCS2,
13563 ascii, ascii + len,
13564 (Py_UCS2 *)writer->data + writer->pos);
13565 break;
13566 }
13567 case PyUnicode_4BYTE_KIND:
13568 {
13569 _PyUnicode_CONVERT_BYTES(
13570 Py_UCS1, Py_UCS4,
13571 ascii, ascii + len,
13572 (Py_UCS4 *)writer->data + writer->pos);
13573 break;
13574 }
13575 default:
13576 assert(0);
13577 }
13578
13579 writer->pos += len;
13580 return 0;
13581}
13582
13583int
13584_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13585 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013586{
13587 Py_UCS4 maxchar;
13588
13589 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13590 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13591 return -1;
13592 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13593 writer->pos += len;
13594 return 0;
13595}
13596
Victor Stinnerd3f08822012-05-29 12:57:52 +020013597PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013598_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013599{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013600 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013601 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013602 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013603 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013604 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013605 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013606 str = writer->buffer;
13607 writer->buffer = NULL;
13608 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13609 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013610 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013611 if (writer->pos == 0) {
13612 Py_CLEAR(writer->buffer);
13613
13614 /* Get the empty Unicode string singleton ('') */
13615 _Py_INCREF_UNICODE_EMPTY();
13616 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013617 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013618 else {
13619 str = writer->buffer;
13620 writer->buffer = NULL;
13621
13622 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13623 PyObject *str2;
13624 str2 = resize_compact(str, writer->pos);
13625 if (str2 == NULL)
13626 return NULL;
13627 str = str2;
13628 }
13629 }
13630
Victor Stinner15a0bd32013-07-08 22:29:55 +020013631 assert(_PyUnicode_CheckConsistency(str, 1));
13632 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013633}
13634
Victor Stinnerd3f08822012-05-29 12:57:52 +020013635void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013636_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013637{
13638 Py_CLEAR(writer->buffer);
13639}
13640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013642
13643PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013645\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013646Return a formatted version of S, using substitutions from args and kwargs.\n\
13647The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013648
Eric Smith27bbca62010-11-04 17:06:58 +000013649PyDoc_STRVAR(format_map__doc__,
13650 "S.format_map(mapping) -> str\n\
13651\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013652Return a formatted version of S, using substitutions from mapping.\n\
13653The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013654
Eric Smith4a7d76d2008-05-30 18:10:19 +000013655static PyObject *
13656unicode__format__(PyObject* self, PyObject* args)
13657{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013658 PyObject *format_spec;
13659 _PyUnicodeWriter writer;
13660 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013661
13662 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13663 return NULL;
13664
Victor Stinnerd3f08822012-05-29 12:57:52 +020013665 if (PyUnicode_READY(self) == -1)
13666 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013667 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013668 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13669 self, format_spec, 0,
13670 PyUnicode_GET_LENGTH(format_spec));
13671 if (ret == -1) {
13672 _PyUnicodeWriter_Dealloc(&writer);
13673 return NULL;
13674 }
13675 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013676}
13677
Eric Smith8c663262007-08-25 02:26:07 +000013678PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013680\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013681Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013682
13683static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013684unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 Py_ssize_t size;
13687
13688 /* If it's a compact object, account for base structure +
13689 character data. */
13690 if (PyUnicode_IS_COMPACT_ASCII(v))
13691 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13692 else if (PyUnicode_IS_COMPACT(v))
13693 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013694 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013695 else {
13696 /* If it is a two-block object, account for base object, and
13697 for character block if present. */
13698 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013699 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013701 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013702 }
13703 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013704 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013705 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013707 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013708 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709
13710 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013711}
13712
13713PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013715
13716static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013717unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013718{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013719 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013720 if (!copy)
13721 return NULL;
13722 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013723}
13724
Guido van Rossumd57fd912000-03-10 22:53:23 +000013725static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013726 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013727 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013728 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13729 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013730 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13731 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013732 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013733 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13734 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13735 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013736 {"expandtabs", (PyCFunction) unicode_expandtabs,
13737 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013738 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013739 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013740 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13741 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13742 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013743 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013744 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13745 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13746 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013747 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013748 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013749 {"splitlines", (PyCFunction) unicode_splitlines,
13750 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013751 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013752 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13753 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13754 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13755 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13756 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13757 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13758 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13759 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13760 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13761 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13762 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13763 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13764 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13765 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013766 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013767 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013768 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013769 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013770 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013771 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013772 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013773 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013774#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013775 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013776 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777#endif
13778
Benjamin Peterson14339b62009-01-31 16:36:08 +000013779 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013780 {NULL, NULL}
13781};
13782
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013783static PyObject *
13784unicode_mod(PyObject *v, PyObject *w)
13785{
Brian Curtindfc80e32011-08-10 20:28:54 -050013786 if (!PyUnicode_Check(v))
13787 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013788 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013789}
13790
13791static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013792 0, /*nb_add*/
13793 0, /*nb_subtract*/
13794 0, /*nb_multiply*/
13795 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013796};
13797
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013799 (lenfunc) unicode_length, /* sq_length */
13800 PyUnicode_Concat, /* sq_concat */
13801 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13802 (ssizeargfunc) unicode_getitem, /* sq_item */
13803 0, /* sq_slice */
13804 0, /* sq_ass_item */
13805 0, /* sq_ass_slice */
13806 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807};
13808
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013809static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013810unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 if (PyUnicode_READY(self) == -1)
13813 return NULL;
13814
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013815 if (PyIndex_Check(item)) {
13816 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013817 if (i == -1 && PyErr_Occurred())
13818 return NULL;
13819 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013820 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013821 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013822 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013823 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013824 PyObject *result;
13825 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013826 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013827 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013831 return NULL;
13832 }
13833
13834 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013835 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013836 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013837 slicelength == PyUnicode_GET_LENGTH(self)) {
13838 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013839 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013840 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013841 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013842 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013843 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013844 src_kind = PyUnicode_KIND(self);
13845 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013846 if (!PyUnicode_IS_ASCII(self)) {
13847 kind_limit = kind_maxchar_limit(src_kind);
13848 max_char = 0;
13849 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13850 ch = PyUnicode_READ(src_kind, src_data, cur);
13851 if (ch > max_char) {
13852 max_char = ch;
13853 if (max_char >= kind_limit)
13854 break;
13855 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013856 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013857 }
Victor Stinner55c99112011-10-13 01:17:06 +020013858 else
13859 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013860 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013861 if (result == NULL)
13862 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013863 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013864 dest_data = PyUnicode_DATA(result);
13865
13866 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013867 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13868 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013869 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013870 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013871 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013872 } else {
13873 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13874 return NULL;
13875 }
13876}
13877
13878static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013879 (lenfunc)unicode_length, /* mp_length */
13880 (binaryfunc)unicode_subscript, /* mp_subscript */
13881 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013882};
13883
Guido van Rossumd57fd912000-03-10 22:53:23 +000013884
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885/* Helpers for PyUnicode_Format() */
13886
Victor Stinnera47082312012-10-04 02:19:54 +020013887struct unicode_formatter_t {
13888 PyObject *args;
13889 int args_owned;
13890 Py_ssize_t arglen, argidx;
13891 PyObject *dict;
13892
13893 enum PyUnicode_Kind fmtkind;
13894 Py_ssize_t fmtcnt, fmtpos;
13895 void *fmtdata;
13896 PyObject *fmtstr;
13897
13898 _PyUnicodeWriter writer;
13899};
13900
13901struct unicode_format_arg_t {
13902 Py_UCS4 ch;
13903 int flags;
13904 Py_ssize_t width;
13905 int prec;
13906 int sign;
13907};
13908
Guido van Rossumd57fd912000-03-10 22:53:23 +000013909static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013910unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013911{
Victor Stinnera47082312012-10-04 02:19:54 +020013912 Py_ssize_t argidx = ctx->argidx;
13913
13914 if (argidx < ctx->arglen) {
13915 ctx->argidx++;
13916 if (ctx->arglen < 0)
13917 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013918 else
Victor Stinnera47082312012-10-04 02:19:54 +020013919 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920 }
13921 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013923 return NULL;
13924}
13925
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013926/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927
Victor Stinnera47082312012-10-04 02:19:54 +020013928/* Format a float into the writer if the writer is not NULL, or into *p_output
13929 otherwise.
13930
13931 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932static int
Victor Stinnera47082312012-10-04 02:19:54 +020013933formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13934 PyObject **p_output,
13935 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013937 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013939 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013940 int prec;
13941 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013942
Guido van Rossumd57fd912000-03-10 22:53:23 +000013943 x = PyFloat_AsDouble(v);
13944 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013945 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013946
Victor Stinnera47082312012-10-04 02:19:54 +020013947 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013950
Victor Stinnera47082312012-10-04 02:19:54 +020013951 if (arg->flags & F_ALT)
13952 dtoa_flags = Py_DTSF_ALT;
13953 else
13954 dtoa_flags = 0;
13955 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013956 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013957 return -1;
13958 len = strlen(p);
13959 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013960 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013961 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013962 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013963 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013964 }
13965 else
13966 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013967 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013968 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013969}
13970
Victor Stinnerd0880d52012-04-27 23:40:13 +020013971/* formatlong() emulates the format codes d, u, o, x and X, and
13972 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13973 * Python's regular ints.
13974 * Return value: a new PyUnicodeObject*, or NULL if error.
13975 * The output string is of the form
13976 * "-"? ("0x" | "0X")? digit+
13977 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13978 * set in flags. The case of hex digits will be correct,
13979 * There will be at least prec digits, zero-filled on the left if
13980 * necessary to get that many.
13981 * val object to be converted
13982 * flags bitmask of format flags; only F_ALT is looked at
13983 * prec minimum number of digits; 0-fill on left if needed
13984 * type a character in [duoxX]; u acts the same as d
13985 *
13986 * CAUTION: o, x and X conversions on regular ints can never
13987 * produce a '-' sign, but can for Python's unbounded ints.
13988 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013989PyObject *
13990_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013991{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013992 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013994 Py_ssize_t i;
13995 int sign; /* 1 if '-', else 0 */
13996 int len; /* number of characters */
13997 Py_ssize_t llen;
13998 int numdigits; /* len == numnondigits + numdigits */
13999 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014000
Victor Stinnerd0880d52012-04-27 23:40:13 +020014001 /* Avoid exceeding SSIZE_T_MAX */
14002 if (prec > INT_MAX-3) {
14003 PyErr_SetString(PyExc_OverflowError,
14004 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014005 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014006 }
14007
14008 assert(PyLong_Check(val));
14009
14010 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014011 default:
14012 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014013 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014014 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014015 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014016 /* int and int subclasses should print numerically when a numeric */
14017 /* format code is used (see issue18780) */
14018 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014019 break;
14020 case 'o':
14021 numnondigits = 2;
14022 result = PyNumber_ToBase(val, 8);
14023 break;
14024 case 'x':
14025 case 'X':
14026 numnondigits = 2;
14027 result = PyNumber_ToBase(val, 16);
14028 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014029 }
14030 if (!result)
14031 return NULL;
14032
14033 assert(unicode_modifiable(result));
14034 assert(PyUnicode_IS_READY(result));
14035 assert(PyUnicode_IS_ASCII(result));
14036
14037 /* To modify the string in-place, there can only be one reference. */
14038 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014039 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014040 PyErr_BadInternalCall();
14041 return NULL;
14042 }
14043 buf = PyUnicode_DATA(result);
14044 llen = PyUnicode_GET_LENGTH(result);
14045 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014046 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014047 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014048 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014049 return NULL;
14050 }
14051 len = (int)llen;
14052 sign = buf[0] == '-';
14053 numnondigits += sign;
14054 numdigits = len - numnondigits;
14055 assert(numdigits > 0);
14056
14057 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014058 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014059 (type == 'o' || type == 'x' || type == 'X'))) {
14060 assert(buf[sign] == '0');
14061 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14062 buf[sign+1] == 'o');
14063 numnondigits -= 2;
14064 buf += 2;
14065 len -= 2;
14066 if (sign)
14067 buf[0] = '-';
14068 assert(len == numnondigits + numdigits);
14069 assert(numdigits > 0);
14070 }
14071
14072 /* Fill with leading zeroes to meet minimum width. */
14073 if (prec > numdigits) {
14074 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14075 numnondigits + prec);
14076 char *b1;
14077 if (!r1) {
14078 Py_DECREF(result);
14079 return NULL;
14080 }
14081 b1 = PyBytes_AS_STRING(r1);
14082 for (i = 0; i < numnondigits; ++i)
14083 *b1++ = *buf++;
14084 for (i = 0; i < prec - numdigits; i++)
14085 *b1++ = '0';
14086 for (i = 0; i < numdigits; i++)
14087 *b1++ = *buf++;
14088 *b1 = '\0';
14089 Py_DECREF(result);
14090 result = r1;
14091 buf = PyBytes_AS_STRING(result);
14092 len = numnondigits + prec;
14093 }
14094
14095 /* Fix up case for hex conversions. */
14096 if (type == 'X') {
14097 /* Need to convert all lower case letters to upper case.
14098 and need to convert 0x to 0X (and -0x to -0X). */
14099 for (i = 0; i < len; i++)
14100 if (buf[i] >= 'a' && buf[i] <= 'x')
14101 buf[i] -= 'a'-'A';
14102 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014103 if (!PyUnicode_Check(result)
14104 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014105 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014106 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014107 Py_DECREF(result);
14108 result = unicode;
14109 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014110 else if (len != PyUnicode_GET_LENGTH(result)) {
14111 if (PyUnicode_Resize(&result, len) < 0)
14112 Py_CLEAR(result);
14113 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014115}
14116
Ethan Furmandf3ed242014-01-05 06:50:30 -080014117/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014118 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014119 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014120 * -1 and raise an exception on error */
14121static int
Victor Stinnera47082312012-10-04 02:19:54 +020014122mainformatlong(PyObject *v,
14123 struct unicode_format_arg_t *arg,
14124 PyObject **p_output,
14125 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014126{
14127 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014128 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014129
14130 if (!PyNumber_Check(v))
14131 goto wrongtype;
14132
Ethan Furman9ab74802014-03-21 06:38:46 -070014133 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014134 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014135 if (type == 'o' || type == 'x' || type == 'X') {
14136 iobj = PyNumber_Index(v);
14137 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014138 if (PyErr_ExceptionMatches(PyExc_TypeError))
14139 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014140 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014141 }
14142 }
14143 else {
14144 iobj = PyNumber_Long(v);
14145 if (iobj == NULL ) {
14146 if (PyErr_ExceptionMatches(PyExc_TypeError))
14147 goto wrongtype;
14148 return -1;
14149 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014150 }
14151 assert(PyLong_Check(iobj));
14152 }
14153 else {
14154 iobj = v;
14155 Py_INCREF(iobj);
14156 }
14157
14158 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014159 && arg->width == -1 && arg->prec == -1
14160 && !(arg->flags & (F_SIGN | F_BLANK))
14161 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014162 {
14163 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014164 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014165 int base;
14166
Victor Stinnera47082312012-10-04 02:19:54 +020014167 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014168 {
14169 default:
14170 assert(0 && "'type' not in [diuoxX]");
14171 case 'd':
14172 case 'i':
14173 case 'u':
14174 base = 10;
14175 break;
14176 case 'o':
14177 base = 8;
14178 break;
14179 case 'x':
14180 case 'X':
14181 base = 16;
14182 break;
14183 }
14184
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014185 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14186 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014187 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014188 }
14189 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014190 return 1;
14191 }
14192
Ethan Furmanb95b5612015-01-23 20:05:18 -080014193 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014194 Py_DECREF(iobj);
14195 if (res == NULL)
14196 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014197 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014198 return 0;
14199
14200wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014201 switch(type)
14202 {
14203 case 'o':
14204 case 'x':
14205 case 'X':
14206 PyErr_Format(PyExc_TypeError,
14207 "%%%c format: an integer is required, "
14208 "not %.200s",
14209 type, Py_TYPE(v)->tp_name);
14210 break;
14211 default:
14212 PyErr_Format(PyExc_TypeError,
14213 "%%%c format: a number is required, "
14214 "not %.200s",
14215 type, Py_TYPE(v)->tp_name);
14216 break;
14217 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014218 return -1;
14219}
14220
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014221static Py_UCS4
14222formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014223{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014224 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014225 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014226 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014227 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014228 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014229 goto onError;
14230 }
14231 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014232 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014233 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014234 /* make sure number is a type of integer */
14235 if (!PyLong_Check(v)) {
14236 iobj = PyNumber_Index(v);
14237 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014238 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014239 }
14240 v = iobj;
14241 Py_DECREF(iobj);
14242 }
14243 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014244 x = PyLong_AsLong(v);
14245 if (x == -1 && PyErr_Occurred())
14246 goto onError;
14247
Victor Stinner8faf8212011-12-08 22:14:11 +010014248 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014249 PyErr_SetString(PyExc_OverflowError,
14250 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014251 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014252 }
14253
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014254 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014256
Benjamin Peterson29060642009-01-31 22:14:21 +000014257 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014258 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014259 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014260 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014261}
14262
Victor Stinnera47082312012-10-04 02:19:54 +020014263/* Parse options of an argument: flags, width, precision.
14264 Handle also "%(name)" syntax.
14265
14266 Return 0 if the argument has been formatted into arg->str.
14267 Return 1 if the argument has been written into ctx->writer,
14268 Raise an exception and return -1 on error. */
14269static int
14270unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14271 struct unicode_format_arg_t *arg)
14272{
14273#define FORMAT_READ(ctx) \
14274 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14275
14276 PyObject *v;
14277
Victor Stinnera47082312012-10-04 02:19:54 +020014278 if (arg->ch == '(') {
14279 /* Get argument value from a dictionary. Example: "%(name)s". */
14280 Py_ssize_t keystart;
14281 Py_ssize_t keylen;
14282 PyObject *key;
14283 int pcount = 1;
14284
14285 if (ctx->dict == NULL) {
14286 PyErr_SetString(PyExc_TypeError,
14287 "format requires a mapping");
14288 return -1;
14289 }
14290 ++ctx->fmtpos;
14291 --ctx->fmtcnt;
14292 keystart = ctx->fmtpos;
14293 /* Skip over balanced parentheses */
14294 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14295 arg->ch = FORMAT_READ(ctx);
14296 if (arg->ch == ')')
14297 --pcount;
14298 else if (arg->ch == '(')
14299 ++pcount;
14300 ctx->fmtpos++;
14301 }
14302 keylen = ctx->fmtpos - keystart - 1;
14303 if (ctx->fmtcnt < 0 || pcount > 0) {
14304 PyErr_SetString(PyExc_ValueError,
14305 "incomplete format key");
14306 return -1;
14307 }
14308 key = PyUnicode_Substring(ctx->fmtstr,
14309 keystart, keystart + keylen);
14310 if (key == NULL)
14311 return -1;
14312 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014313 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014314 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014315 }
14316 ctx->args = PyObject_GetItem(ctx->dict, key);
14317 Py_DECREF(key);
14318 if (ctx->args == NULL)
14319 return -1;
14320 ctx->args_owned = 1;
14321 ctx->arglen = -1;
14322 ctx->argidx = -2;
14323 }
14324
14325 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014326 while (--ctx->fmtcnt >= 0) {
14327 arg->ch = FORMAT_READ(ctx);
14328 ctx->fmtpos++;
14329 switch (arg->ch) {
14330 case '-': arg->flags |= F_LJUST; continue;
14331 case '+': arg->flags |= F_SIGN; continue;
14332 case ' ': arg->flags |= F_BLANK; continue;
14333 case '#': arg->flags |= F_ALT; continue;
14334 case '0': arg->flags |= F_ZERO; continue;
14335 }
14336 break;
14337 }
14338
14339 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014340 if (arg->ch == '*') {
14341 v = unicode_format_getnextarg(ctx);
14342 if (v == NULL)
14343 return -1;
14344 if (!PyLong_Check(v)) {
14345 PyErr_SetString(PyExc_TypeError,
14346 "* wants int");
14347 return -1;
14348 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014349 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014350 if (arg->width == -1 && PyErr_Occurred())
14351 return -1;
14352 if (arg->width < 0) {
14353 arg->flags |= F_LJUST;
14354 arg->width = -arg->width;
14355 }
14356 if (--ctx->fmtcnt >= 0) {
14357 arg->ch = FORMAT_READ(ctx);
14358 ctx->fmtpos++;
14359 }
14360 }
14361 else if (arg->ch >= '0' && arg->ch <= '9') {
14362 arg->width = arg->ch - '0';
14363 while (--ctx->fmtcnt >= 0) {
14364 arg->ch = FORMAT_READ(ctx);
14365 ctx->fmtpos++;
14366 if (arg->ch < '0' || arg->ch > '9')
14367 break;
14368 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14369 mixing signed and unsigned comparison. Since arg->ch is between
14370 '0' and '9', casting to int is safe. */
14371 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14372 PyErr_SetString(PyExc_ValueError,
14373 "width too big");
14374 return -1;
14375 }
14376 arg->width = arg->width*10 + (arg->ch - '0');
14377 }
14378 }
14379
14380 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014381 if (arg->ch == '.') {
14382 arg->prec = 0;
14383 if (--ctx->fmtcnt >= 0) {
14384 arg->ch = FORMAT_READ(ctx);
14385 ctx->fmtpos++;
14386 }
14387 if (arg->ch == '*') {
14388 v = unicode_format_getnextarg(ctx);
14389 if (v == NULL)
14390 return -1;
14391 if (!PyLong_Check(v)) {
14392 PyErr_SetString(PyExc_TypeError,
14393 "* wants int");
14394 return -1;
14395 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014396 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014397 if (arg->prec == -1 && PyErr_Occurred())
14398 return -1;
14399 if (arg->prec < 0)
14400 arg->prec = 0;
14401 if (--ctx->fmtcnt >= 0) {
14402 arg->ch = FORMAT_READ(ctx);
14403 ctx->fmtpos++;
14404 }
14405 }
14406 else if (arg->ch >= '0' && arg->ch <= '9') {
14407 arg->prec = arg->ch - '0';
14408 while (--ctx->fmtcnt >= 0) {
14409 arg->ch = FORMAT_READ(ctx);
14410 ctx->fmtpos++;
14411 if (arg->ch < '0' || arg->ch > '9')
14412 break;
14413 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14414 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014415 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014416 return -1;
14417 }
14418 arg->prec = arg->prec*10 + (arg->ch - '0');
14419 }
14420 }
14421 }
14422
14423 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14424 if (ctx->fmtcnt >= 0) {
14425 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14426 if (--ctx->fmtcnt >= 0) {
14427 arg->ch = FORMAT_READ(ctx);
14428 ctx->fmtpos++;
14429 }
14430 }
14431 }
14432 if (ctx->fmtcnt < 0) {
14433 PyErr_SetString(PyExc_ValueError,
14434 "incomplete format");
14435 return -1;
14436 }
14437 return 0;
14438
14439#undef FORMAT_READ
14440}
14441
14442/* Format one argument. Supported conversion specifiers:
14443
14444 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014445 - "i", "d", "u": int or float
14446 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014447 - "e", "E", "f", "F", "g", "G": float
14448 - "c": int or str (1 character)
14449
Victor Stinner8dbd4212012-12-04 09:30:24 +010014450 When possible, the output is written directly into the Unicode writer
14451 (ctx->writer). A string is created when padding is required.
14452
Victor Stinnera47082312012-10-04 02:19:54 +020014453 Return 0 if the argument has been formatted into *p_str,
14454 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014455 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014456static int
14457unicode_format_arg_format(struct unicode_formatter_t *ctx,
14458 struct unicode_format_arg_t *arg,
14459 PyObject **p_str)
14460{
14461 PyObject *v;
14462 _PyUnicodeWriter *writer = &ctx->writer;
14463
14464 if (ctx->fmtcnt == 0)
14465 ctx->writer.overallocate = 0;
14466
14467 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014468 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014469 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014470 return 1;
14471 }
14472
14473 v = unicode_format_getnextarg(ctx);
14474 if (v == NULL)
14475 return -1;
14476
Victor Stinnera47082312012-10-04 02:19:54 +020014477
14478 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014479 case 's':
14480 case 'r':
14481 case 'a':
14482 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14483 /* Fast path */
14484 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14485 return -1;
14486 return 1;
14487 }
14488
14489 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14490 *p_str = v;
14491 Py_INCREF(*p_str);
14492 }
14493 else {
14494 if (arg->ch == 's')
14495 *p_str = PyObject_Str(v);
14496 else if (arg->ch == 'r')
14497 *p_str = PyObject_Repr(v);
14498 else
14499 *p_str = PyObject_ASCII(v);
14500 }
14501 break;
14502
14503 case 'i':
14504 case 'd':
14505 case 'u':
14506 case 'o':
14507 case 'x':
14508 case 'X':
14509 {
14510 int ret = mainformatlong(v, arg, p_str, writer);
14511 if (ret != 0)
14512 return ret;
14513 arg->sign = 1;
14514 break;
14515 }
14516
14517 case 'e':
14518 case 'E':
14519 case 'f':
14520 case 'F':
14521 case 'g':
14522 case 'G':
14523 if (arg->width == -1 && arg->prec == -1
14524 && !(arg->flags & (F_SIGN | F_BLANK)))
14525 {
14526 /* Fast path */
14527 if (formatfloat(v, arg, NULL, writer) == -1)
14528 return -1;
14529 return 1;
14530 }
14531
14532 arg->sign = 1;
14533 if (formatfloat(v, arg, p_str, NULL) == -1)
14534 return -1;
14535 break;
14536
14537 case 'c':
14538 {
14539 Py_UCS4 ch = formatchar(v);
14540 if (ch == (Py_UCS4) -1)
14541 return -1;
14542 if (arg->width == -1 && arg->prec == -1) {
14543 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014544 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014545 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014546 return 1;
14547 }
14548 *p_str = PyUnicode_FromOrdinal(ch);
14549 break;
14550 }
14551
14552 default:
14553 PyErr_Format(PyExc_ValueError,
14554 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014555 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014556 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14557 (int)arg->ch,
14558 ctx->fmtpos - 1);
14559 return -1;
14560 }
14561 if (*p_str == NULL)
14562 return -1;
14563 assert (PyUnicode_Check(*p_str));
14564 return 0;
14565}
14566
14567static int
14568unicode_format_arg_output(struct unicode_formatter_t *ctx,
14569 struct unicode_format_arg_t *arg,
14570 PyObject *str)
14571{
14572 Py_ssize_t len;
14573 enum PyUnicode_Kind kind;
14574 void *pbuf;
14575 Py_ssize_t pindex;
14576 Py_UCS4 signchar;
14577 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014578 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014579 Py_ssize_t sublen;
14580 _PyUnicodeWriter *writer = &ctx->writer;
14581 Py_UCS4 fill;
14582
14583 fill = ' ';
14584 if (arg->sign && arg->flags & F_ZERO)
14585 fill = '0';
14586
14587 if (PyUnicode_READY(str) == -1)
14588 return -1;
14589
14590 len = PyUnicode_GET_LENGTH(str);
14591 if ((arg->width == -1 || arg->width <= len)
14592 && (arg->prec == -1 || arg->prec >= len)
14593 && !(arg->flags & (F_SIGN | F_BLANK)))
14594 {
14595 /* Fast path */
14596 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14597 return -1;
14598 return 0;
14599 }
14600
14601 /* Truncate the string for "s", "r" and "a" formats
14602 if the precision is set */
14603 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14604 if (arg->prec >= 0 && len > arg->prec)
14605 len = arg->prec;
14606 }
14607
14608 /* Adjust sign and width */
14609 kind = PyUnicode_KIND(str);
14610 pbuf = PyUnicode_DATA(str);
14611 pindex = 0;
14612 signchar = '\0';
14613 if (arg->sign) {
14614 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14615 if (ch == '-' || ch == '+') {
14616 signchar = ch;
14617 len--;
14618 pindex++;
14619 }
14620 else if (arg->flags & F_SIGN)
14621 signchar = '+';
14622 else if (arg->flags & F_BLANK)
14623 signchar = ' ';
14624 else
14625 arg->sign = 0;
14626 }
14627 if (arg->width < len)
14628 arg->width = len;
14629
14630 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014631 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014632 if (!(arg->flags & F_LJUST)) {
14633 if (arg->sign) {
14634 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014635 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014636 }
14637 else {
14638 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014639 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014640 }
14641 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014642 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14643 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014644 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014645 }
14646
Victor Stinnera47082312012-10-04 02:19:54 +020014647 buflen = arg->width;
14648 if (arg->sign && len == arg->width)
14649 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014650 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014651 return -1;
14652
14653 /* Write the sign if needed */
14654 if (arg->sign) {
14655 if (fill != ' ') {
14656 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14657 writer->pos += 1;
14658 }
14659 if (arg->width > len)
14660 arg->width--;
14661 }
14662
14663 /* Write the numeric prefix for "x", "X" and "o" formats
14664 if the alternate form is used.
14665 For example, write "0x" for the "%#x" format. */
14666 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14667 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14668 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14669 if (fill != ' ') {
14670 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14671 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14672 writer->pos += 2;
14673 pindex += 2;
14674 }
14675 arg->width -= 2;
14676 if (arg->width < 0)
14677 arg->width = 0;
14678 len -= 2;
14679 }
14680
14681 /* Pad left with the fill character if needed */
14682 if (arg->width > len && !(arg->flags & F_LJUST)) {
14683 sublen = arg->width - len;
14684 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14685 writer->pos += sublen;
14686 arg->width = len;
14687 }
14688
14689 /* If padding with spaces: write sign if needed and/or numeric prefix if
14690 the alternate form is used */
14691 if (fill == ' ') {
14692 if (arg->sign) {
14693 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14694 writer->pos += 1;
14695 }
14696 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14697 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14698 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14699 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14700 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14701 writer->pos += 2;
14702 pindex += 2;
14703 }
14704 }
14705
14706 /* Write characters */
14707 if (len) {
14708 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14709 str, pindex, len);
14710 writer->pos += len;
14711 }
14712
14713 /* Pad right with the fill character if needed */
14714 if (arg->width > len) {
14715 sublen = arg->width - len;
14716 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14717 writer->pos += sublen;
14718 }
14719 return 0;
14720}
14721
14722/* Helper of PyUnicode_Format(): format one arg.
14723 Return 0 on success, raise an exception and return -1 on error. */
14724static int
14725unicode_format_arg(struct unicode_formatter_t *ctx)
14726{
14727 struct unicode_format_arg_t arg;
14728 PyObject *str;
14729 int ret;
14730
Victor Stinner8dbd4212012-12-04 09:30:24 +010014731 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14732 arg.flags = 0;
14733 arg.width = -1;
14734 arg.prec = -1;
14735 arg.sign = 0;
14736 str = NULL;
14737
Victor Stinnera47082312012-10-04 02:19:54 +020014738 ret = unicode_format_arg_parse(ctx, &arg);
14739 if (ret == -1)
14740 return -1;
14741
14742 ret = unicode_format_arg_format(ctx, &arg, &str);
14743 if (ret == -1)
14744 return -1;
14745
14746 if (ret != 1) {
14747 ret = unicode_format_arg_output(ctx, &arg, str);
14748 Py_DECREF(str);
14749 if (ret == -1)
14750 return -1;
14751 }
14752
14753 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14754 PyErr_SetString(PyExc_TypeError,
14755 "not all arguments converted during string formatting");
14756 return -1;
14757 }
14758 return 0;
14759}
14760
Alexander Belopolsky40018472011-02-26 01:02:56 +000014761PyObject *
14762PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014763{
Victor Stinnera47082312012-10-04 02:19:54 +020014764 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014765
Guido van Rossumd57fd912000-03-10 22:53:23 +000014766 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014767 PyErr_BadInternalCall();
14768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014769 }
Victor Stinnera47082312012-10-04 02:19:54 +020014770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014771 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014772 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014773
14774 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014775 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14776 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14777 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14778 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014779
Victor Stinner8f674cc2013-04-17 23:02:17 +020014780 _PyUnicodeWriter_Init(&ctx.writer);
14781 ctx.writer.min_length = ctx.fmtcnt + 100;
14782 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014783
Guido van Rossumd57fd912000-03-10 22:53:23 +000014784 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014785 ctx.arglen = PyTuple_Size(args);
14786 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014787 }
14788 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014789 ctx.arglen = -1;
14790 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014791 }
Victor Stinnera47082312012-10-04 02:19:54 +020014792 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014793 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014794 ctx.dict = args;
14795 else
14796 ctx.dict = NULL;
14797 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014798
Victor Stinnera47082312012-10-04 02:19:54 +020014799 while (--ctx.fmtcnt >= 0) {
14800 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014801 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014802
14803 nonfmtpos = ctx.fmtpos++;
14804 while (ctx.fmtcnt >= 0 &&
14805 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14806 ctx.fmtpos++;
14807 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014808 }
Victor Stinnera47082312012-10-04 02:19:54 +020014809 if (ctx.fmtcnt < 0) {
14810 ctx.fmtpos--;
14811 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014812 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014813
Victor Stinnercfc4c132013-04-03 01:48:39 +020014814 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14815 nonfmtpos, ctx.fmtpos) < 0)
14816 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014817 }
14818 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014819 ctx.fmtpos++;
14820 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014821 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014822 }
14823 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014824
Victor Stinnera47082312012-10-04 02:19:54 +020014825 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014826 PyErr_SetString(PyExc_TypeError,
14827 "not all arguments converted during string formatting");
14828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014829 }
14830
Victor Stinnera47082312012-10-04 02:19:54 +020014831 if (ctx.args_owned) {
14832 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014833 }
Victor Stinnera47082312012-10-04 02:19:54 +020014834 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014835
Benjamin Peterson29060642009-01-31 22:14:21 +000014836 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014837 _PyUnicodeWriter_Dealloc(&ctx.writer);
14838 if (ctx.args_owned) {
14839 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014840 }
14841 return NULL;
14842}
14843
Jeremy Hylton938ace62002-07-17 16:30:39 +000014844static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014845unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14846
Tim Peters6d6c1a32001-08-02 04:15:00 +000014847static PyObject *
14848unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14849{
Benjamin Peterson29060642009-01-31 22:14:21 +000014850 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014851 static char *kwlist[] = {"object", "encoding", "errors", 0};
14852 char *encoding = NULL;
14853 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014854
Benjamin Peterson14339b62009-01-31 16:36:08 +000014855 if (type != &PyUnicode_Type)
14856 return unicode_subtype_new(type, args, kwds);
14857 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014858 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014859 return NULL;
14860 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014861 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014862 if (encoding == NULL && errors == NULL)
14863 return PyObject_Str(x);
14864 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014865 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014866}
14867
Guido van Rossume023fe02001-08-30 03:12:59 +000014868static PyObject *
14869unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14870{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014871 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014872 Py_ssize_t length, char_size;
14873 int share_wstr, share_utf8;
14874 unsigned int kind;
14875 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014876
Benjamin Peterson14339b62009-01-31 16:36:08 +000014877 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014878
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014879 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014880 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014881 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014882 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014883 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014884 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014885 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014886 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014887
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014888 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014889 if (self == NULL) {
14890 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 return NULL;
14892 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014893 kind = PyUnicode_KIND(unicode);
14894 length = PyUnicode_GET_LENGTH(unicode);
14895
14896 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014897#ifdef Py_DEBUG
14898 _PyUnicode_HASH(self) = -1;
14899#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014900 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014901#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014902 _PyUnicode_STATE(self).interned = 0;
14903 _PyUnicode_STATE(self).kind = kind;
14904 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014905 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014906 _PyUnicode_STATE(self).ready = 1;
14907 _PyUnicode_WSTR(self) = NULL;
14908 _PyUnicode_UTF8_LENGTH(self) = 0;
14909 _PyUnicode_UTF8(self) = NULL;
14910 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014911 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014912
14913 share_utf8 = 0;
14914 share_wstr = 0;
14915 if (kind == PyUnicode_1BYTE_KIND) {
14916 char_size = 1;
14917 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14918 share_utf8 = 1;
14919 }
14920 else if (kind == PyUnicode_2BYTE_KIND) {
14921 char_size = 2;
14922 if (sizeof(wchar_t) == 2)
14923 share_wstr = 1;
14924 }
14925 else {
14926 assert(kind == PyUnicode_4BYTE_KIND);
14927 char_size = 4;
14928 if (sizeof(wchar_t) == 4)
14929 share_wstr = 1;
14930 }
14931
14932 /* Ensure we won't overflow the length. */
14933 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14934 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014935 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014936 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014937 data = PyObject_MALLOC((length + 1) * char_size);
14938 if (data == NULL) {
14939 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014940 goto onError;
14941 }
14942
Victor Stinnerc3c74152011-10-02 20:39:55 +020014943 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014944 if (share_utf8) {
14945 _PyUnicode_UTF8_LENGTH(self) = length;
14946 _PyUnicode_UTF8(self) = data;
14947 }
14948 if (share_wstr) {
14949 _PyUnicode_WSTR_LENGTH(self) = length;
14950 _PyUnicode_WSTR(self) = (wchar_t *)data;
14951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014952
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014953 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014954 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014955 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014956#ifdef Py_DEBUG
14957 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14958#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014959 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014960 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014961
14962onError:
14963 Py_DECREF(unicode);
14964 Py_DECREF(self);
14965 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014966}
14967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014968PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014969"str(object='') -> str\n\
14970str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014971\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014972Create a new string object from the given object. If encoding or\n\
14973errors is specified, then the object must expose a data buffer\n\
14974that will be decoded using the given encoding and error handler.\n\
14975Otherwise, returns the result of object.__str__() (if defined)\n\
14976or repr(object).\n\
14977encoding defaults to sys.getdefaultencoding().\n\
14978errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014979
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014980static PyObject *unicode_iter(PyObject *seq);
14981
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014983 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014984 "str", /* tp_name */
14985 sizeof(PyUnicodeObject), /* tp_size */
14986 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014988 (destructor)unicode_dealloc, /* tp_dealloc */
14989 0, /* tp_print */
14990 0, /* tp_getattr */
14991 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014992 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014993 unicode_repr, /* tp_repr */
14994 &unicode_as_number, /* tp_as_number */
14995 &unicode_as_sequence, /* tp_as_sequence */
14996 &unicode_as_mapping, /* tp_as_mapping */
14997 (hashfunc) unicode_hash, /* tp_hash*/
14998 0, /* tp_call*/
14999 (reprfunc) unicode_str, /* tp_str */
15000 PyObject_GenericGetAttr, /* tp_getattro */
15001 0, /* tp_setattro */
15002 0, /* tp_as_buffer */
15003 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015004 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 unicode_doc, /* tp_doc */
15006 0, /* tp_traverse */
15007 0, /* tp_clear */
15008 PyUnicode_RichCompare, /* tp_richcompare */
15009 0, /* tp_weaklistoffset */
15010 unicode_iter, /* tp_iter */
15011 0, /* tp_iternext */
15012 unicode_methods, /* tp_methods */
15013 0, /* tp_members */
15014 0, /* tp_getset */
15015 &PyBaseObject_Type, /* tp_base */
15016 0, /* tp_dict */
15017 0, /* tp_descr_get */
15018 0, /* tp_descr_set */
15019 0, /* tp_dictoffset */
15020 0, /* tp_init */
15021 0, /* tp_alloc */
15022 unicode_new, /* tp_new */
15023 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015024};
15025
15026/* Initialize the Unicode implementation */
15027
Victor Stinner3a50e702011-10-18 21:21:00 +020015028int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015029{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015030 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015031 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015032 0x000A, /* LINE FEED */
15033 0x000D, /* CARRIAGE RETURN */
15034 0x001C, /* FILE SEPARATOR */
15035 0x001D, /* GROUP SEPARATOR */
15036 0x001E, /* RECORD SEPARATOR */
15037 0x0085, /* NEXT LINE */
15038 0x2028, /* LINE SEPARATOR */
15039 0x2029, /* PARAGRAPH SEPARATOR */
15040 };
15041
Fred Drakee4315f52000-05-09 19:53:39 +000015042 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015043 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015044 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015045 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015046 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015047
Guido van Rossumcacfc072002-05-24 19:01:59 +000015048 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015049 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015050
15051 /* initialize the linebreak bloom filter */
15052 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015053 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015054 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015055
Christian Heimes26532f72013-07-20 14:57:16 +020015056 if (PyType_Ready(&EncodingMapType) < 0)
15057 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015058
Benjamin Petersonc4311282012-10-30 23:21:10 -040015059 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15060 Py_FatalError("Can't initialize field name iterator type");
15061
15062 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15063 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015064
Victor Stinner3a50e702011-10-18 21:21:00 +020015065 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015066}
15067
15068/* Finalize the Unicode implementation */
15069
/* This implementation does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15075
Guido van Rossumd57fd912000-03-10 22:53:23 +000015076void
Thomas Wouters78890102000-07-22 19:25:51 +000015077_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015078{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015079 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015080
Serhiy Storchaka05997252013-01-26 12:14:02 +020015081 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015082
Serhiy Storchaka05997252013-01-26 12:14:02 +020015083 for (i = 0; i < 256; i++)
15084 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015085 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015086 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015088
Walter Dörwald16807132007-05-25 13:52:07 +000015089void
15090PyUnicode_InternInPlace(PyObject **p)
15091{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015092 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015094#ifdef Py_DEBUG
15095 assert(s != NULL);
15096 assert(_PyUnicode_CHECK(s));
15097#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015099 return;
15100#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 /* If it's a subclass, we don't really know what putting
15102 it in the interned dict might do. */
15103 if (!PyUnicode_CheckExact(s))
15104 return;
15105 if (PyUnicode_CHECK_INTERNED(s))
15106 return;
15107 if (interned == NULL) {
15108 interned = PyDict_New();
15109 if (interned == NULL) {
15110 PyErr_Clear(); /* Don't leave an exception */
15111 return;
15112 }
15113 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015115 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015117 if (t == NULL) {
15118 PyErr_Clear();
15119 return;
15120 }
15121 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015122 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015123 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015124 return;
15125 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015126 /* The two references in interned are not counted by refcnt.
15127 The deallocator will take care of this */
15128 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015129 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015130}
15131
15132void
15133PyUnicode_InternImmortal(PyObject **p)
15134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 PyUnicode_InternInPlace(p);
15136 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015137 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 Py_INCREF(*p);
15139 }
Walter Dörwald16807132007-05-25 13:52:07 +000015140}
15141
15142PyObject *
15143PyUnicode_InternFromString(const char *cp)
15144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015145 PyObject *s = PyUnicode_FromString(cp);
15146 if (s == NULL)
15147 return NULL;
15148 PyUnicode_InternInPlace(&s);
15149 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015150}
15151
Alexander Belopolsky40018472011-02-26 01:02:56 +000015152void
15153_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015156 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 Py_ssize_t i, n;
15158 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015159
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 if (interned == NULL || !PyDict_Check(interned))
15161 return;
15162 keys = PyDict_Keys(interned);
15163 if (keys == NULL || !PyList_Check(keys)) {
15164 PyErr_Clear();
15165 return;
15166 }
Walter Dörwald16807132007-05-25 13:52:07 +000015167
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15169 detector, interned unicode strings are not forcibly deallocated;
15170 rather, we give them their stolen references back, and then clear
15171 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015172
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 n = PyList_GET_SIZE(keys);
15174 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015175 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015177 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015178 if (PyUnicode_READY(s) == -1) {
15179 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015180 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015182 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 case SSTATE_NOT_INTERNED:
15184 /* XXX Shouldn't happen */
15185 break;
15186 case SSTATE_INTERNED_IMMORTAL:
15187 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015188 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015189 break;
15190 case SSTATE_INTERNED_MORTAL:
15191 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015192 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 break;
15194 default:
15195 Py_FatalError("Inconsistent interned string state.");
15196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015197 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 }
15199 fprintf(stderr, "total size of all interned strings: "
15200 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15201 "mortal/immortal\n", mortal_size, immortal_size);
15202 Py_DECREF(keys);
15203 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015204 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015205}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015206
15207
15208/********************* Unicode Iterator **************************/
15209
15210typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 PyObject_HEAD
15212 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015213 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015214} unicodeiterobject;
15215
15216static void
15217unicodeiter_dealloc(unicodeiterobject *it)
15218{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015219 _PyObject_GC_UNTRACK(it);
15220 Py_XDECREF(it->it_seq);
15221 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015222}
15223
15224static int
15225unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 Py_VISIT(it->it_seq);
15228 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015229}
15230
15231static PyObject *
15232unicodeiter_next(unicodeiterobject *it)
15233{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015234 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 assert(it != NULL);
15237 seq = it->it_seq;
15238 if (seq == NULL)
15239 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015240 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015242 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15243 int kind = PyUnicode_KIND(seq);
15244 void *data = PyUnicode_DATA(seq);
15245 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15246 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 if (item != NULL)
15248 ++it->it_index;
15249 return item;
15250 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015251
Benjamin Peterson14339b62009-01-31 16:36:08 +000015252 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015253 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015255}
15256
15257static PyObject *
15258unicodeiter_len(unicodeiterobject *it)
15259{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 Py_ssize_t len = 0;
15261 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015262 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015264}
15265
15266PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15267
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015268static PyObject *
15269unicodeiter_reduce(unicodeiterobject *it)
15270{
15271 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015272 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015273 it->it_seq, it->it_index);
15274 } else {
15275 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15276 if (u == NULL)
15277 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015278 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015279 }
15280}
15281
15282PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15283
15284static PyObject *
15285unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15286{
15287 Py_ssize_t index = PyLong_AsSsize_t(state);
15288 if (index == -1 && PyErr_Occurred())
15289 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015290 if (it->it_seq != NULL) {
15291 if (index < 0)
15292 index = 0;
15293 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15294 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15295 it->it_index = index;
15296 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015297 Py_RETURN_NONE;
15298}
15299
15300PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15301
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015302static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015304 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015305 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15306 reduce_doc},
15307 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15308 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015310};
15311
15312PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15314 "str_iterator", /* tp_name */
15315 sizeof(unicodeiterobject), /* tp_basicsize */
15316 0, /* tp_itemsize */
15317 /* methods */
15318 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15319 0, /* tp_print */
15320 0, /* tp_getattr */
15321 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015322 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 0, /* tp_repr */
15324 0, /* tp_as_number */
15325 0, /* tp_as_sequence */
15326 0, /* tp_as_mapping */
15327 0, /* tp_hash */
15328 0, /* tp_call */
15329 0, /* tp_str */
15330 PyObject_GenericGetAttr, /* tp_getattro */
15331 0, /* tp_setattro */
15332 0, /* tp_as_buffer */
15333 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15334 0, /* tp_doc */
15335 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15336 0, /* tp_clear */
15337 0, /* tp_richcompare */
15338 0, /* tp_weaklistoffset */
15339 PyObject_SelfIter, /* tp_iter */
15340 (iternextfunc)unicodeiter_next, /* tp_iternext */
15341 unicodeiter_methods, /* tp_methods */
15342 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015343};
15344
15345static PyObject *
15346unicode_iter(PyObject *seq)
15347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015349
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 if (!PyUnicode_Check(seq)) {
15351 PyErr_BadInternalCall();
15352 return NULL;
15353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015354 if (PyUnicode_READY(seq) == -1)
15355 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15357 if (it == NULL)
15358 return NULL;
15359 it->it_index = 0;
15360 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015361 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 _PyObject_GC_TRACK(it);
15363 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015364}
15365
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015366
15367size_t
15368Py_UNICODE_strlen(const Py_UNICODE *u)
15369{
15370 int res = 0;
15371 while(*u++)
15372 res++;
15373 return res;
15374}
15375
15376Py_UNICODE*
15377Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15378{
15379 Py_UNICODE *u = s1;
15380 while ((*u++ = *s2++));
15381 return s1;
15382}
15383
15384Py_UNICODE*
15385Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15386{
15387 Py_UNICODE *u = s1;
15388 while ((*u++ = *s2++))
15389 if (n-- == 0)
15390 break;
15391 return s1;
15392}
15393
15394Py_UNICODE*
15395Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15396{
15397 Py_UNICODE *u1 = s1;
15398 u1 += Py_UNICODE_strlen(u1);
15399 Py_UNICODE_strcpy(u1, s2);
15400 return s1;
15401}
15402
15403int
15404Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15405{
15406 while (*s1 && *s2 && *s1 == *s2)
15407 s1++, s2++;
15408 if (*s1 && *s2)
15409 return (*s1 < *s2) ? -1 : +1;
15410 if (*s1)
15411 return 1;
15412 if (*s2)
15413 return -1;
15414 return 0;
15415}
15416
15417int
15418Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15419{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015420 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015421 for (; n != 0; n--) {
15422 u1 = *s1;
15423 u2 = *s2;
15424 if (u1 != u2)
15425 return (u1 < u2) ? -1 : +1;
15426 if (u1 == '\0')
15427 return 0;
15428 s1++;
15429 s2++;
15430 }
15431 return 0;
15432}
15433
15434Py_UNICODE*
15435Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15436{
15437 const Py_UNICODE *p;
15438 for (p = s; *p; p++)
15439 if (*p == c)
15440 return (Py_UNICODE*)p;
15441 return NULL;
15442}
15443
15444Py_UNICODE*
15445Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15446{
15447 const Py_UNICODE *p;
15448 p = s + Py_UNICODE_strlen(s);
15449 while (p != s) {
15450 p--;
15451 if (*p == c)
15452 return (Py_UNICODE*)p;
15453 }
15454 return NULL;
15455}
Victor Stinner331ea922010-08-10 16:37:20 +000015456
Victor Stinner71133ff2010-09-01 23:43:53 +000015457Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015458PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015459{
Victor Stinner577db2c2011-10-11 22:12:48 +020015460 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015461 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015463 if (!PyUnicode_Check(unicode)) {
15464 PyErr_BadArgument();
15465 return NULL;
15466 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015467 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015468 if (u == NULL)
15469 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015470 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015471 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015472 PyErr_NoMemory();
15473 return NULL;
15474 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015475 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015476 size *= sizeof(Py_UNICODE);
15477 copy = PyMem_Malloc(size);
15478 if (copy == NULL) {
15479 PyErr_NoMemory();
15480 return NULL;
15481 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015482 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015483 return copy;
15484}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015485
Georg Brandl66c221e2010-10-14 07:04:07 +000015486/* A _string module, to export formatter_parser and formatter_field_name_split
15487 to the string.Formatter class implemented in Python. */
15488
15489static PyMethodDef _string_methods[] = {
15490 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15491 METH_O, PyDoc_STR("split the argument as a field name")},
15492 {"formatter_parser", (PyCFunction) formatter_parser,
15493 METH_O, PyDoc_STR("parse the argument as a format string")},
15494 {NULL, NULL}
15495};
15496
15497static struct PyModuleDef _string_module = {
15498 PyModuleDef_HEAD_INIT,
15499 "_string",
15500 PyDoc_STR("string helper module"),
15501 0,
15502 _string_methods,
15503 NULL,
15504 NULL,
15505 NULL,
15506 NULL
15507};
15508
15509PyMODINIT_FUNC
15510PyInit__string(void)
15511{
15512 return PyModule_Create(&_string_module);
15513}
15514
15515
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015516#ifdef __cplusplus
15517}
15518#endif