blob: 3767064e50498177494d406b896a69b0fa2cc633 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10FFFF
71
/* Type check used by the accessor macros in this file: a consistency check
   (without content checking) in Py_DEBUG builds, a plain PyUnicode_Check()
   otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* --- Field accessors --------------------------------------------------
   Macros prefixed with an underscore and lacking assertions are raw field
   accesses (usable as lvalues); the others assert on the object first. */

/* UTF-8 buffer pointer stored in the compact-object header. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 representation of a ready string: for compact ASCII objects this is
   the memory directly following the PyASCIIObject header, otherwise the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length stored in the compact-object header. */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII objects this is the
   string length itself, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw header fields. */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and length fields. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw data pointer of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Replace any earlier definition of PyUnicode_READY with one that asserts
   via _PyUnicode_CHECK().  Evaluates to 0 when the string is already ready,
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro uses only its own argument; it previously referred to a
   variable named `unicode` in the caller's scope instead of `op`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object has its own allocated UTF-8 memory block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && PyUnicode_DATA(op) != _PyUnicode_UTF8(op)))
134
/* True if the Unicode object has its own allocated wstr memory block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || PyUnicode_DATA(op) != _PyUnicode_WSTR(op))))
141
/* Generic helper macro converting a run of characters from one integer
   type to another.

   from_type, to_type: valid type names.
   begin, end:         pointers to the source characters ("from_type *");
                       characters in [begin, end) are converted.
   to:                 pointer ("to_type *") to the output buffer.

   The bulk of the input is processed four characters per iteration, the
   remaining (at most three) characters one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Over-allocation factor.  Per the original measurements, over-allocating
   by 25% (factor 4) works best on Linux and by 50% (factor 2) on Windows. */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
/* Return the shared empty string from the enclosing function after taking
   a reference via _Py_INCREF_UNICODE_EMPTY().  Returns NULL if the empty
   string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()      \
    do {                                \
        _Py_INCREF_UNICODE_EMPTY();     \
        return unicode_empty;           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of ASCII line breaks: a 128-entry table indexed by ASCII
   code, 1 for line-break characters and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognised by get_error_handler().  Any name it
   does not know is reported as _Py_ERROR_OTHER; _Py_ERROR_UNKNOWN is the
   zero value and is never returned by get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN           = 0,
    _Py_ERROR_STRICT            = 1,    /* "strict" (also used for NULL) */
    _Py_ERROR_SURROGATEESCAPE   = 2,    /* "surrogateescape" */
    _Py_ERROR_REPLACE           = 3,    /* "replace" */
    _Py_ERROR_IGNORE            = 4,    /* "ignore" */
    _Py_ERROR_BACKSLASHREPLACE  = 5,    /* "backslashreplace" */
    _Py_ERROR_SURROGATEPASS     = 6,    /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE = 7,    /* "xmlcharrefreplace" */
    _Py_ERROR_OTHER             = 8     /* any other handler name */
} _Py_error_handler;
315
316static _Py_error_handler
317get_error_handler(const char *errors)
318{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200319 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200320 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200321 }
322 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200323 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200324 }
325 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200326 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200327 }
328 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200329 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200330 }
331 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200332 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200333 }
334 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200335 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200336 }
337 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200338 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200339 }
Victor Stinner50149202015-09-22 00:26:54 +0200340 return _Py_ERROR_OTHER;
341}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build sanity checker for the internal state of a Unicode object.

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not in the wchar-only
                  state), also scan the characters to verify that the
                  narrowest suitable kind is used and that the data is
                  null-terminated.

   Every violated invariant trips an assert().  Always returns 1, so the
   call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the data follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero recorded length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask of BLOOM_WIDTH bits, using
   the low bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, length) are overwritten, every byte
   being set to 0xff; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_ssize_t added = new_length - old_length;
        memset(bytes + old_length * char_size, 0xff, added * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1171
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate immediately followed by a low surrogate is joined into
   one code point; any other unit, including an unpaired surrogate, is
   copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* only a canonical string of length 1 can be a cached character */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001995 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002011 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return NULL;
2019
Victor Stinner8faf8212011-12-08 22:14:11 +01002020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 if (!unicode)
2022 return NULL;
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#else
2043 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045#endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002051 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 return NULL;
2061 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070{
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002077}
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002101 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002106 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107}
2108
Benjamin Peterson0df54292012-03-26 14:50:32 -04002109/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002110
Victor Stinnerd3f08822012-05-29 12:57:52 +02002111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002113{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002114 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002115 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002117#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002118 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002119#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002120 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002121 }
Victor Stinner785938e2011-12-11 20:09:03 +01002122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002124 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002128}
2129
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002133 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002142 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002143 }
2144}
2145
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002146static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002147align_maxchar(Py_UCS4 maxchar)
2148{
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157}
2158
Victor Stinner702c7342011-10-05 13:50:52 +02002159static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002164
Serhiy Storchaka678db842013-01-26 12:16:36 +02002165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002168 if (size == 1)
2169 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002171 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002176 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002178}
2179
Victor Stinnere57b1c02011-09-28 22:20:48 +02002180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182{
2183 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002185
Serhiy Storchaka678db842013-01-26 12:16:36 +02002186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002188 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002189 if (size == 1)
2190 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002192 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!res)
2195 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
Victor Stinnere57b1c02011-09-28 22:20:48 +02002206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208{
2209 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002211
Serhiy Storchaka678db842013-01-26 12:16:36 +02002212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002215 if (size == 1)
2216 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002218 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (!res)
2221 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002248 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252}
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
Victor Stinner94d558b2012-04-27 22:26:58 +02002271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
Victor Stinnerece58de2012-04-23 23:36:38 +02002274 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002275 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002285 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 assert(0);
2287 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 }
2289}
2290
Victor Stinner25a4b292011-10-06 12:31:55 +02002291/* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002294static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334}
2335
Victor Stinner034f6cf2011-09-30 02:26:44 +02002336PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002337_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338{
Victor Stinner87af4f22011-11-21 23:03:47 +01002339 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002340 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002346 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner87af4f22011-11-21 23:03:47 +01002349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
Christian Heimesf051e432016-09-13 20:22:02 +02002355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002357 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002359}
2360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
Benjamin Petersonbac79492012-01-14 13:34:47 -05002372 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002382 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002383 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002413 default:
2414 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 }
Victor Stinner01698042011-10-04 00:04:26 +02002416 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423{
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002461 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002472 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482 return as_ucs4(string, NULL, 0, 1);
2483}
2484
2485#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002486
Alexander Belopolsky40018472011-02-26 01:02:56 +00002487PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002488PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002492 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 PyErr_BadInternalCall();
2494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496
Martin v. Löwis790465f2008-04-05 20:41:37 +00002497 if (size == -1) {
2498 size = wcslen(w);
2499 }
2500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502}
2503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002505
/* Size of the scratch buffer used when formatting %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256), which gives the
   (SIZEOF_LONG_LONG*53-1)/22 term.  The leading 2 covers the sign plus
   one more character (NOTE(review): presumably the terminating NUL --
   confirm). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002510
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514{
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554{
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570}
2571
Victor Stinner96865452011-03-01 23:44:09 +00002572static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002573unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002575{
Victor Stinnere215d962012-10-06 23:03:36 +02002576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t width;
2580 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002584 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585
2586 p = f;
2587 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
Victor Stinner96865452011-03-01 23:44:09 +00002593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002598 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002599 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002601 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002603 return NULL;
2604 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002606 f++;
2607 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002631 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002632 f--;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002638 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longflag = 1;
2642 ++f;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002646 longlongflag = 1;
2647 f += 2;
2648 }
Victor Stinner96865452011-03-01 23:44:09 +00002649 }
2650 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002652 size_tflag = 1;
2653 ++f;
2654 }
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002664 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002669 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002679 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002680 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002681
2682 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002685 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002687 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002688 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, size_t));
2692 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 }
2699 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002704 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002705 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002706 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, Py_ssize_t));
2709 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
Victor Stinnere215d962012-10-06 23:03:36 +02002715 if (precision < len)
2716 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
2718 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner15a11362012-10-06 23:48:20 +02002730 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002731 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736
Victor Stinner4a587072013-11-19 12:54:53 +01002737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
Victor Stinner4a587072013-11-19 12:54:53 +01002760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 }
2793 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002796 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002867 return f;
2868}
2869
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876
Victor Stinner8f674cc2013-04-17 23:02:17 +02002877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002880
Benjamin Peterson0c212142016-09-20 20:39:33 -07002881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002885 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 const char *p;
2892 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893
Victor Stinnere215d962012-10-06 23:03:36 +02002894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
2898 PyErr_Format(PyExc_ValueError,
2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900 "string, got a non-ASCII byte: 0x%02x",
2901 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002902 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 }
2904 p++;
2905 }
2906 while (*p != '\0' && *p != '%');
2907 len = p - f;
2908
2909 if (*p == '\0')
2910 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002911
2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002918 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002919 return _PyUnicodeWriter_Finish(&writer);
2920
2921 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002922 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002971 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Serhiy Storchaka0edffa32017-06-27 21:08:58 +03003020wchar_t*
3021_PyUnicode_AsWideCharString(PyObject *unicode)
3022{
3023 const wchar_t *wstr;
3024 wchar_t *buffer;
3025 Py_ssize_t buflen;
3026
3027 if (unicode == NULL) {
3028 PyErr_BadInternalCall();
3029 return NULL;
3030 }
3031
3032 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3033 if (wstr == NULL) {
3034 return NULL;
3035 }
3036 if (wcslen(wstr) != (size_t)buflen) {
3037 PyErr_SetString(PyExc_ValueError,
3038 "embedded null character");
3039 return NULL;
3040 }
3041
3042 buffer = PyMem_NEW(wchar_t, buflen + 1);
3043 if (buffer == NULL) {
3044 PyErr_NoMemory();
3045 return NULL;
3046 }
3047 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3048 return buffer;
3049}
3050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003051#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
Alexander Belopolsky40018472011-02-26 01:02:56 +00003053PyObject *
3054PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003055{
Victor Stinner8faf8212011-12-08 22:14:11 +01003056 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 PyErr_SetString(PyExc_ValueError,
3058 "chr() arg not in range(0x110000)");
3059 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003060 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003061
Victor Stinner985a82a2014-01-03 12:53:47 +01003062 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003066PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003068 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003070 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003071 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003072 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 Py_INCREF(obj);
3074 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003075 }
3076 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 /* For a Unicode subtype that's not a Unicode object,
3078 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003079 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003080 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003081 PyErr_Format(PyExc_TypeError,
3082 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003083 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003084 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003085}
3086
Alexander Belopolsky40018472011-02-26 01:02:56 +00003087PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003088PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003089 const char *encoding,
3090 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003091{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003092 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003093 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003094
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 PyErr_BadInternalCall();
3097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003099
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 /* Decoding bytes objects is the most common case and should be fast */
3101 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003102 if (PyBytes_GET_SIZE(obj) == 0)
3103 _Py_RETURN_UNICODE_EMPTY();
3104 v = PyUnicode_Decode(
3105 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3106 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003107 return v;
3108 }
3109
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003110 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 PyErr_SetString(PyExc_TypeError,
3112 "decoding str is not supported");
3113 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003114 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003115
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003116 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3117 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3118 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003119 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003120 Py_TYPE(obj)->tp_name);
3121 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003122 }
Tim Petersced69f82003-09-16 20:30:58 +00003123
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003124 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003125 PyBuffer_Release(&buffer);
3126 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003128
Serhiy Storchaka05997252013-01-26 12:14:02 +02003129 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003130 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003131 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132}
3133
/* Normalize an encoding name, similar to encodings.normalize_encoding() but
   lowercasing as well.

   Alphanumeric characters and '.' are kept (lowercased).  Each run of any
   other characters that sits between two kept characters is replaced by a
   single '_'; such runs at the start or the end of the name are dropped.
   The result is written to lower as a null-terminated string.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_limit;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    /* Last slot of the buffer, reserved for the null terminator. */
    out_limit = &lower[lower_len - 1];
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Separator-like character: only remember that one was seen. */
            pending_sep = 1;
            continue;
        }

        /* Emit the deferred '_' only when something was already written,
           so the output never starts with an underscore. */
        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3182
Alexander Belopolsky40018472011-02-26 01:02:56 +00003183PyObject *
3184PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003185 Py_ssize_t size,
3186 const char *encoding,
3187 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003188{
3189 PyObject *buffer = NULL, *unicode;
3190 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003191 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3192
3193 if (encoding == NULL) {
3194 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3195 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003196
Fred Drakee4315f52000-05-09 19:53:39 +00003197 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003198 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3199 char *lower = buflower;
3200
3201 /* Fast paths */
3202 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3203 lower += 3;
3204 if (*lower == '_') {
3205 /* Match "utf8" and "utf_8" */
3206 lower++;
3207 }
3208
3209 if (lower[0] == '8' && lower[1] == 0) {
3210 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3211 }
3212 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3213 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3214 }
3215 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3216 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3217 }
3218 }
3219 else {
3220 if (strcmp(lower, "ascii") == 0
3221 || strcmp(lower, "us_ascii") == 0) {
3222 return PyUnicode_DecodeASCII(s, size, errors);
3223 }
Steve Dowercc16be82016-09-08 10:35:16 -07003224 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003225 else if (strcmp(lower, "mbcs") == 0) {
3226 return PyUnicode_DecodeMBCS(s, size, errors);
3227 }
3228 #endif
3229 else if (strcmp(lower, "latin1") == 0
3230 || strcmp(lower, "latin_1") == 0
3231 || strcmp(lower, "iso_8859_1") == 0
3232 || strcmp(lower, "iso8859_1") == 0) {
3233 return PyUnicode_DecodeLatin1(s, size, errors);
3234 }
3235 }
Victor Stinner37296e82010-06-10 13:36:23 +00003236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237
3238 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003239 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003240 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003241 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003242 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 if (buffer == NULL)
3244 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003245 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 if (unicode == NULL)
3247 goto onError;
3248 if (!PyUnicode_Check(unicode)) {
3249 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003250 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3251 "use codecs.decode() to decode to arbitrary types",
3252 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003253 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 Py_DECREF(unicode);
3255 goto onError;
3256 }
3257 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003258 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003259
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 Py_XDECREF(buffer);
3262 return NULL;
3263}
3264
Alexander Belopolsky40018472011-02-26 01:02:56 +00003265PyObject *
3266PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003267 const char *encoding,
3268 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003269{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003270 if (!PyUnicode_Check(unicode)) {
3271 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003272 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003273 }
3274
Serhiy Storchaka00939072016-10-27 21:05:49 +03003275 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3276 "PyUnicode_AsDecodedObject() is deprecated; "
3277 "use PyCodec_Decode() to decode from str", 1) < 0)
3278 return NULL;
3279
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003280 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003282
3283 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003284 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003285}
3286
Alexander Belopolsky40018472011-02-26 01:02:56 +00003287PyObject *
3288PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003289 const char *encoding,
3290 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003291{
3292 PyObject *v;
3293
3294 if (!PyUnicode_Check(unicode)) {
3295 PyErr_BadArgument();
3296 goto onError;
3297 }
3298
Serhiy Storchaka00939072016-10-27 21:05:49 +03003299 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3300 "PyUnicode_AsDecodedUnicode() is deprecated; "
3301 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3302 return NULL;
3303
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003304 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003306
3307 /* Decode via the codec registry */
3308 v = PyCodec_Decode(unicode, encoding, errors);
3309 if (v == NULL)
3310 goto onError;
3311 if (!PyUnicode_Check(v)) {
3312 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003313 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3314 "use codecs.decode() to decode to arbitrary types",
3315 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003316 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003317 Py_DECREF(v);
3318 goto onError;
3319 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003320 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003321
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003323 return NULL;
3324}
3325
Alexander Belopolsky40018472011-02-26 01:02:56 +00003326PyObject *
3327PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003328 Py_ssize_t size,
3329 const char *encoding,
3330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331{
3332 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003333
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 unicode = PyUnicode_FromUnicode(s, size);
3335 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3338 Py_DECREF(unicode);
3339 return v;
3340}
3341
Alexander Belopolsky40018472011-02-26 01:02:56 +00003342PyObject *
3343PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003344 const char *encoding,
3345 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346{
3347 PyObject *v;
3348
3349 if (!PyUnicode_Check(unicode)) {
3350 PyErr_BadArgument();
3351 goto onError;
3352 }
3353
Serhiy Storchaka00939072016-10-27 21:05:49 +03003354 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3355 "PyUnicode_AsEncodedObject() is deprecated; "
3356 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3357 "or PyCodec_Encode() for generic encoding", 1) < 0)
3358 return NULL;
3359
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003360 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003361 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003362
3363 /* Encode via the codec registry */
3364 v = PyCodec_Encode(unicode, encoding, errors);
3365 if (v == NULL)
3366 goto onError;
3367 return v;
3368
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003370 return NULL;
3371}
3372
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide) and the index of the first unit for
   which wcstombs() reports an error is returned.  Returns 0 when every
   character converts, which is indistinguishable from a failure at
   index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) followed by a null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;
    size_t converted;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3419
Victor Stinner1b579672011-12-17 05:47:23 +01003420static int
3421locale_error_handler(const char *errors, int *surrogateescape)
3422{
Victor Stinner50149202015-09-22 00:26:54 +02003423 _Py_error_handler error_handler = get_error_handler(errors);
3424 switch (error_handler)
3425 {
3426 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003427 *surrogateescape = 0;
3428 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003429 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003430 *surrogateescape = 1;
3431 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003432 default:
3433 PyErr_Format(PyExc_ValueError,
3434 "only 'strict' and 'surrogateescape' error handlers "
3435 "are supported, not '%s'",
3436 errors);
3437 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003438 }
Victor Stinner1b579672011-12-17 05:47:23 +01003439}
3440
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003441PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003442PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003443{
3444 Py_ssize_t wlen, wlen2;
3445 wchar_t *wstr;
3446 PyObject *bytes = NULL;
3447 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003448 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 PyObject *exc;
3450 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003451 int surrogateescape;
3452
3453 if (locale_error_handler(errors, &surrogateescape) < 0)
3454 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455
3456 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3457 if (wstr == NULL)
3458 return NULL;
3459
3460 wlen2 = wcslen(wstr);
3461 if (wlen2 != wlen) {
3462 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003463 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003464 return NULL;
3465 }
3466
3467 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003468 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003469 char *str;
3470
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003471 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003472 if (str == NULL) {
3473 if (error_pos == (size_t)-1) {
3474 PyErr_NoMemory();
3475 PyMem_Free(wstr);
3476 return NULL;
3477 }
3478 else {
3479 goto encode_error;
3480 }
3481 }
3482 PyMem_Free(wstr);
3483
3484 bytes = PyBytes_FromString(str);
3485 PyMem_Free(str);
3486 }
3487 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003488 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489 size_t len, len2;
3490
3491 len = wcstombs(NULL, wstr, 0);
3492 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003493 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003494 goto encode_error;
3495 }
3496
3497 bytes = PyBytes_FromStringAndSize(NULL, len);
3498 if (bytes == NULL) {
3499 PyMem_Free(wstr);
3500 return NULL;
3501 }
3502
3503 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3504 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003505 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003506 goto encode_error;
3507 }
3508 PyMem_Free(wstr);
3509 }
3510 return bytes;
3511
3512encode_error:
3513 errmsg = strerror(errno);
3514 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003515
3516 if (error_pos == (size_t)-1)
3517 error_pos = wcstombs_errorpos(wstr);
3518
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003519 PyMem_Free(wstr);
3520 Py_XDECREF(bytes);
3521
Victor Stinner2f197072011-12-17 07:08:30 +01003522 if (errmsg != NULL) {
3523 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003524 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003525 if (wstr != NULL) {
3526 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003527 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003528 } else
3529 errmsg = NULL;
3530 }
3531 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003532 reason = PyUnicode_FromString(
3533 "wcstombs() encountered an unencodable "
3534 "wide character");
3535 if (reason == NULL)
3536 return NULL;
3537
3538 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3539 "locale", unicode,
3540 (Py_ssize_t)error_pos,
3541 (Py_ssize_t)(error_pos+1),
3542 reason);
3543 Py_DECREF(reason);
3544 if (exc != NULL) {
3545 PyCodec_StrictErrors(exc);
3546 Py_XDECREF(exc);
3547 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003548 return NULL;
3549}
3550
Victor Stinnerad158722010-10-27 00:25:46 +00003551PyObject *
3552PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003553{
Steve Dowercc16be82016-09-08 10:35:16 -07003554#if defined(__APPLE__)
3555 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003556#else
Victor Stinner793b5312011-04-27 00:24:21 +02003557 PyInterpreterState *interp = PyThreadState_GET()->interp;
3558 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3559 cannot use it to encode and decode filenames before it is loaded. Load
3560 the Python codec requires to encode at least its own filename. Use the C
3561 version of the locale codec until the codec registry is initialized and
3562 the Python codec is loaded.
3563
3564 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3565 cannot only rely on it: check also interp->fscodec_initialized for
3566 subinterpreters. */
3567 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003568 return PyUnicode_AsEncodedString(unicode,
3569 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003570 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003571 }
3572 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003573 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003574 }
Victor Stinnerad158722010-10-27 00:25:46 +00003575#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003576}
3577
Alexander Belopolsky40018472011-02-26 01:02:56 +00003578PyObject *
3579PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003580 const char *encoding,
3581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582{
3583 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003584 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 if (!PyUnicode_Check(unicode)) {
3587 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 }
Fred Drakee4315f52000-05-09 19:53:39 +00003590
Victor Stinner942889a2016-09-05 15:40:10 -07003591 if (encoding == NULL) {
3592 return _PyUnicode_AsUTF8String(unicode, errors);
3593 }
3594
Fred Drakee4315f52000-05-09 19:53:39 +00003595 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003596 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3597 char *lower = buflower;
3598
3599 /* Fast paths */
3600 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3601 lower += 3;
3602 if (*lower == '_') {
3603 /* Match "utf8" and "utf_8" */
3604 lower++;
3605 }
3606
3607 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003608 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003609 }
3610 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3611 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3612 }
3613 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3614 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3615 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003616 }
Victor Stinner942889a2016-09-05 15:40:10 -07003617 else {
3618 if (strcmp(lower, "ascii") == 0
3619 || strcmp(lower, "us_ascii") == 0) {
3620 return _PyUnicode_AsASCIIString(unicode, errors);
3621 }
Steve Dowercc16be82016-09-08 10:35:16 -07003622#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003623 else if (strcmp(lower, "mbcs") == 0) {
3624 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3625 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003626#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003627 else if (strcmp(lower, "latin1") == 0 ||
3628 strcmp(lower, "latin_1") == 0 ||
3629 strcmp(lower, "iso_8859_1") == 0 ||
3630 strcmp(lower, "iso8859_1") == 0) {
3631 return _PyUnicode_AsLatin1String(unicode, errors);
3632 }
3633 }
Victor Stinner37296e82010-06-10 13:36:23 +00003634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635
3636 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003637 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003639 return NULL;
3640
3641 /* The normal path */
3642 if (PyBytes_Check(v))
3643 return v;
3644
3645 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003646 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003647 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003648 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003649
3650 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003651 "encoder %s returned bytearray instead of bytes; "
3652 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003653 encoding);
3654 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003655 Py_DECREF(v);
3656 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003657 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003659 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3660 Py_DECREF(v);
3661 return b;
3662 }
3663
3664 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003665 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3666 "use codecs.encode() to encode to arbitrary types",
3667 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003668 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003669 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003670 return NULL;
3671}
3672
Alexander Belopolsky40018472011-02-26 01:02:56 +00003673PyObject *
3674PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003675 const char *encoding,
3676 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003677{
3678 PyObject *v;
3679
3680 if (!PyUnicode_Check(unicode)) {
3681 PyErr_BadArgument();
3682 goto onError;
3683 }
3684
Serhiy Storchaka00939072016-10-27 21:05:49 +03003685 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3686 "PyUnicode_AsEncodedUnicode() is deprecated; "
3687 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3688 return NULL;
3689
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003690 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003692
3693 /* Encode via the codec registry */
3694 v = PyCodec_Encode(unicode, encoding, errors);
3695 if (v == NULL)
3696 goto onError;
3697 if (!PyUnicode_Check(v)) {
3698 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003699 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3700 "use codecs.encode() to encode to arbitrary types",
3701 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003702 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003703 Py_DECREF(v);
3704 goto onError;
3705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003707
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 return NULL;
3710}
3711
/* Return the byte offset in `str` (of `len` bytes) at which mbrtowc()
   first reports a conversion error or an incomplete character.  Returns 0
   when no such position is found, and always returns 0 when the build has
   no mbrtowc() (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3742
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003743PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003744PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003745 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746{
3747 wchar_t smallbuf[256];
3748 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3749 wchar_t *wstr;
3750 size_t wlen, wlen2;
3751 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003752 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003753 size_t error_pos;
3754 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003755 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3756 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003757
3758 if (locale_error_handler(errors, &surrogateescape) < 0)
3759 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003760
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003761 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3762 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003763 return NULL;
3764 }
3765
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003766 if (surrogateescape) {
3767 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003768 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003769 if (wstr == NULL) {
3770 if (wlen == (size_t)-1)
3771 PyErr_NoMemory();
3772 else
3773 PyErr_SetFromErrno(PyExc_OSError);
3774 return NULL;
3775 }
3776
3777 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003778 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003779 }
3780 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003781 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003782#ifndef HAVE_BROKEN_MBSTOWCS
3783 wlen = mbstowcs(NULL, str, 0);
3784#else
3785 wlen = len;
3786#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003787 if (wlen == (size_t)-1)
3788 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003789 if (wlen+1 <= smallbuf_len) {
3790 wstr = smallbuf;
3791 }
3792 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003793 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003794 if (!wstr)
3795 return PyErr_NoMemory();
3796 }
3797
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003798 wlen2 = mbstowcs(wstr, str, wlen+1);
3799 if (wlen2 == (size_t)-1) {
3800 if (wstr != smallbuf)
3801 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003802 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003803 }
3804#ifdef HAVE_BROKEN_MBSTOWCS
3805 assert(wlen2 == wlen);
3806#endif
3807 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3808 if (wstr != smallbuf)
3809 PyMem_Free(wstr);
3810 }
3811 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003812
3813decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003814 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003815 errmsg = strerror(errno);
3816 assert(errmsg != NULL);
3817
3818 error_pos = mbstowcs_errorpos(str, len);
3819 if (errmsg != NULL) {
3820 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003821 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003822 if (wstr != NULL) {
3823 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003824 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003825 }
Victor Stinner2f197072011-12-17 07:08:30 +01003826 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003827 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003828 reason = PyUnicode_FromString(
3829 "mbstowcs() encountered an invalid multibyte sequence");
3830 if (reason == NULL)
3831 return NULL;
3832
3833 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3834 "locale", str, len,
3835 (Py_ssize_t)error_pos,
3836 (Py_ssize_t)(error_pos+1),
3837 reason);
3838 Py_DECREF(reason);
3839 if (exc != NULL) {
3840 PyCodec_StrictErrors(exc);
3841 Py_XDECREF(exc);
3842 }
3843 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003844}
3845
3846PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003847PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003848{
3849 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003850 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003851}
3852
3853
3854PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003855PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003856 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003857 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3858}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003859
Christian Heimes5894ba72007-11-04 11:43:14 +00003860PyObject*
3861PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3862{
Steve Dowercc16be82016-09-08 10:35:16 -07003863#if defined(__APPLE__)
3864 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003865#else
Victor Stinner793b5312011-04-27 00:24:21 +02003866 PyInterpreterState *interp = PyThreadState_GET()->interp;
3867 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3868 cannot use it to encode and decode filenames before it is loaded. Load
3869 the Python codec requires to encode at least its own filename. Use the C
3870 version of the locale codec until the codec registry is initialized and
3871 the Python codec is loaded.
3872
3873 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3874 cannot only rely on it: check also interp->fscodec_initialized for
3875 subinterpreters. */
3876 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003877 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003878 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003879 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003880 }
3881 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003882 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003883 }
Victor Stinnerad158722010-10-27 00:25:46 +00003884#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003885}
3886
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887
3888int
3889PyUnicode_FSConverter(PyObject* arg, void* addr)
3890{
Brett Cannonec6ce872016-09-06 15:50:29 -07003891 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003892 PyObject *output = NULL;
3893 Py_ssize_t size;
3894 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003895 if (arg == NULL) {
3896 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003897 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003898 return 1;
3899 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003900 path = PyOS_FSPath(arg);
3901 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003902 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003903 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003904 if (PyBytes_Check(path)) {
3905 output = path;
3906 }
3907 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3908 output = PyUnicode_EncodeFSDefault(path);
3909 Py_DECREF(path);
3910 if (!output) {
3911 return 0;
3912 }
3913 assert(PyBytes_Check(output));
3914 }
3915
Victor Stinner0ea2a462010-04-30 00:22:08 +00003916 size = PyBytes_GET_SIZE(output);
3917 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003918 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003920 Py_DECREF(output);
3921 return 0;
3922 }
3923 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003924 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003925}
3926
3927
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003928int
3929PyUnicode_FSDecoder(PyObject* arg, void* addr)
3930{
Brett Cannona5711202016-09-06 19:36:01 -07003931 int is_buffer = 0;
3932 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003933 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003934 if (arg == NULL) {
3935 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka7a113a02017-04-20 22:55:06 +03003936 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003937 return 1;
3938 }
Brett Cannona5711202016-09-06 19:36:01 -07003939
3940 is_buffer = PyObject_CheckBuffer(arg);
3941 if (!is_buffer) {
3942 path = PyOS_FSPath(arg);
3943 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003944 return 0;
3945 }
Brett Cannona5711202016-09-06 19:36:01 -07003946 }
3947 else {
3948 path = arg;
3949 Py_INCREF(arg);
3950 }
3951
3952 if (PyUnicode_Check(path)) {
3953 if (PyUnicode_READY(path) == -1) {
3954 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003955 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003956 }
3957 output = path;
3958 }
3959 else if (PyBytes_Check(path) || is_buffer) {
3960 PyObject *path_bytes = NULL;
3961
3962 if (!PyBytes_Check(path) &&
3963 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3964 "path should be string, bytes, or os.PathLike, not %.200s",
3965 Py_TYPE(arg)->tp_name)) {
3966 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003968 }
3969 path_bytes = PyBytes_FromObject(path);
3970 Py_DECREF(path);
3971 if (!path_bytes) {
3972 return 0;
3973 }
3974 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3975 PyBytes_GET_SIZE(path_bytes));
3976 Py_DECREF(path_bytes);
3977 if (!output) {
3978 return 0;
3979 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003980 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003981 else {
3982 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003983 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003984 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003985 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003986 return 0;
3987 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003988 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003989 Py_DECREF(output);
3990 return 0;
3991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003993 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003994 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003995 Py_DECREF(output);
3996 return 0;
3997 }
3998 *(PyObject**)addr = output;
3999 return Py_CLEANUP_SUPPORTED;
4000}
4001
4002
Martin v. Löwis5b222132007-06-10 09:51:05 +00004003char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004004PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004005{
Christian Heimesf3863112007-11-22 07:46:41 +00004006 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004008 if (!PyUnicode_Check(unicode)) {
4009 PyErr_BadArgument();
4010 return NULL;
4011 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004012 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004013 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004015 if (PyUnicode_UTF8(unicode) == NULL) {
4016 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004017 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 if (bytes == NULL)
4019 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004020 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4021 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004022 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 Py_DECREF(bytes);
4024 return NULL;
4025 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004026 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004027 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004028 PyBytes_AS_STRING(bytes),
4029 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 Py_DECREF(bytes);
4031 }
4032
4033 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004034 *psize = PyUnicode_UTF8_LENGTH(unicode);
4035 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004036}
4037
4038char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4042}
4043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044Py_UNICODE *
4045PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 const unsigned char *one_byte;
4048#if SIZEOF_WCHAR_T == 4
4049 const Py_UCS2 *two_bytes;
4050#else
4051 const Py_UCS4 *four_bytes;
4052 const Py_UCS4 *ucs4_end;
4053 Py_ssize_t num_surrogates;
4054#endif
4055 wchar_t *w;
4056 wchar_t *wchar_end;
4057
4058 if (!PyUnicode_Check(unicode)) {
4059 PyErr_BadArgument();
4060 return NULL;
4061 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004062 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004064 assert(_PyUnicode_KIND(unicode) != 0);
4065 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004067 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004069 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4070 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 num_surrogates = 0;
4072
4073 for (; four_bytes < ucs4_end; ++four_bytes) {
4074 if (*four_bytes > 0xFFFF)
4075 ++num_surrogates;
4076 }
4077
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004078 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4079 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4080 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 PyErr_NoMemory();
4082 return NULL;
4083 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004084 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004086 w = _PyUnicode_WSTR(unicode);
4087 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4088 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4090 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004091 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004093 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4094 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 }
4096 else
4097 *w = *four_bytes;
4098
4099 if (w > wchar_end) {
4100 assert(0 && "Miscalculated string end");
4101 }
4102 }
4103 *w = 0;
4104#else
4105 /* sizeof(wchar_t) == 4 */
4106 Py_FatalError("Impossible unicode object state, wstr and str "
4107 "should share memory already.");
4108 return NULL;
4109#endif
4110 }
4111 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004112 if ((size_t)_PyUnicode_LENGTH(unicode) >
4113 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4114 PyErr_NoMemory();
4115 return NULL;
4116 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004117 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4118 (_PyUnicode_LENGTH(unicode) + 1));
4119 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 PyErr_NoMemory();
4121 return NULL;
4122 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004123 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4124 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4125 w = _PyUnicode_WSTR(unicode);
4126 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004127
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004128 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4129 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004130 for (; w < wchar_end; ++one_byte, ++w)
4131 *w = *one_byte;
4132 /* null-terminate the wstr */
4133 *w = 0;
4134 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004135 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004137 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 for (; w < wchar_end; ++two_bytes, ++w)
4139 *w = *two_bytes;
4140 /* null-terminate the wstr */
4141 *w = 0;
4142#else
4143 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004144 PyObject_FREE(_PyUnicode_WSTR(unicode));
4145 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 Py_FatalError("Impossible unicode object state, wstr "
4147 "and str should share memory already.");
4148 return NULL;
4149#endif
4150 }
4151 else {
4152 assert(0 && "This should never happen.");
4153 }
4154 }
4155 }
4156 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004157 *size = PyUnicode_WSTR_LENGTH(unicode);
4158 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004159}
4160
Alexander Belopolsky40018472011-02-26 01:02:56 +00004161Py_UNICODE *
4162PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165}
4166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167
Alexander Belopolsky40018472011-02-26 01:02:56 +00004168Py_ssize_t
4169PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170{
4171 if (!PyUnicode_Check(unicode)) {
4172 PyErr_BadArgument();
4173 goto onError;
4174 }
4175 return PyUnicode_GET_SIZE(unicode);
4176
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 return -1;
4179}
4180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181Py_ssize_t
4182PyUnicode_GetLength(PyObject *unicode)
4183{
Victor Stinner07621332012-06-16 04:53:46 +02004184 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 PyErr_BadArgument();
4186 return -1;
4187 }
Victor Stinner07621332012-06-16 04:53:46 +02004188 if (PyUnicode_READY(unicode) == -1)
4189 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004190 return PyUnicode_GET_LENGTH(unicode);
4191}
4192
4193Py_UCS4
4194PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4195{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004196 void *data;
4197 int kind;
4198
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004199 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4200 PyErr_BadArgument();
4201 return (Py_UCS4)-1;
4202 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004203 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004204 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 return (Py_UCS4)-1;
4206 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004207 data = PyUnicode_DATA(unicode);
4208 kind = PyUnicode_KIND(unicode);
4209 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210}
4211
4212int
4213PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4214{
4215 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004216 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004217 return -1;
4218 }
Victor Stinner488fa492011-12-12 00:01:39 +01004219 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004220 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004221 PyErr_SetString(PyExc_IndexError, "string index out of range");
4222 return -1;
4223 }
Victor Stinner488fa492011-12-12 00:01:39 +01004224 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004225 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004226 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4227 PyErr_SetString(PyExc_ValueError, "character out of range");
4228 return -1;
4229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4231 index, ch);
4232 return 0;
4233}
4234
/* Return the name of the default encoding, which is fixed to "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4240
Victor Stinner554f3f02010-06-16 23:33:54 +00004241/* create or adjust a UnicodeDecodeError */
4242static void
4243make_decode_exception(PyObject **exceptionObject,
4244 const char *encoding,
4245 const char *input, Py_ssize_t length,
4246 Py_ssize_t startpos, Py_ssize_t endpos,
4247 const char *reason)
4248{
4249 if (*exceptionObject == NULL) {
4250 *exceptionObject = PyUnicodeDecodeError_Create(
4251 encoding, input, length, startpos, endpos, reason);
4252 }
4253 else {
4254 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4255 goto onError;
4256 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4257 goto onError;
4258 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4259 goto onError;
4260 }
4261 return;
4262
4263onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004264 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004265}
4266
#ifdef MS_WINDOWS
/* Error handling callback helper (wchar_t output variant):
   build the exception, call the error handler callback and check its
   result; if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler
       Name of the error handler and a cache for the callable.  The
       callable is looked up with PyCodec_LookupError() when *errorHandler
       is still NULL.
   encoding, reason
       Used to create/update the UnicodeDecodeError in *exceptionObject.
   input, inend
       Start and end of the input buffer.  They are reloaded from the
       exception's `object` attribute after the callback ran, because the
       callback may have replaced that attribute.
   startinpos, endinpos
       Range of the undecodable input.  *endinpos is set to the position
       the handler asked to resume at.
   inptr
       Receives the address in the (possibly new) input to resume from.
   output, outpos
       The wchar_t-kind result string and the write position in it.  The
       string is resized when the replacement plus the remaining input
       would not fit.

   Returns 0 on success, -1 on error (with an exception set). */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "O!n;" doubles as the TypeError message below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    /* repunicode is borrowed from restuple. */
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: continuing would read a non-bytes object through
           PyBytes_AS_STRING and could return success with an exception
           pending. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* NOTE(review): wcsncpy stops copying at an embedded L'\0' and pads
       the rest with zeros, so a replacement containing NUL characters
       would be truncated here -- confirm whether that is intended. */
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004382
4383static int
4384unicode_decode_call_errorhandler_writer(
4385 const char *errors, PyObject **errorHandler,
4386 const char *encoding, const char *reason,
4387 const char **input, const char **inend, Py_ssize_t *startinpos,
4388 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4389 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4390{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004391 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392
4393 PyObject *restuple = NULL;
4394 PyObject *repunicode = NULL;
4395 Py_ssize_t insize;
4396 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004397 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 PyObject *inputobj = NULL;
4399
4400 if (*errorHandler == NULL) {
4401 *errorHandler = PyCodec_LookupError(errors);
4402 if (*errorHandler == NULL)
4403 goto onError;
4404 }
4405
4406 make_decode_exception(exceptionObject,
4407 encoding,
4408 *input, *inend - *input,
4409 *startinpos, *endinpos,
4410 reason);
4411 if (*exceptionObject == NULL)
4412 goto onError;
4413
4414 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4415 if (restuple == NULL)
4416 goto onError;
4417 if (!PyTuple_Check(restuple)) {
4418 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4419 goto onError;
4420 }
4421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004423
4424 /* Copy back the bytes variables, which might have been modified by the
4425 callback */
4426 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4427 if (!inputobj)
4428 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004429 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004431 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004432 *input = PyBytes_AS_STRING(inputobj);
4433 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004434 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004435 /* we can DECREF safely, as the exception has another reference,
4436 so the object won't go away. */
4437 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004441 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004442 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445
Victor Stinner8f674cc2013-04-17 23:02:17 +02004446 if (PyUnicode_READY(repunicode) < 0)
4447 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004448 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004449 if (replen > 1) {
4450 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004451 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004452 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4453 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4454 goto onError;
4455 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004457 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004460 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 Py_XDECREF(restuple);
4464 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469}
4470
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 *
 * All macros in this section may evaluate their argument more than once,
 * so pass side-effect-free expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * with a value outside 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004551
Alexander Belopolsky40018472011-02-26 01:02:56 +00004552PyObject *
4553PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004554 Py_ssize_t size,
4555 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004557 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4558}
4559
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560/* The decoder. The only state we preserve is our read position,
4561 * i.e. how many characters we have consumed. So if we end in the
4562 * middle of a shift sequence we have to back off the read position
4563 * and the output to the beginning of the sequence, otherwise we lose
4564 * all the shift state (seen bits, number of bits seen, high
4565 * surrogate). */
4566
Alexander Belopolsky40018472011-02-26 01:02:56 +00004567PyObject *
4568PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004569 Py_ssize_t size,
4570 const char *errors,
4571 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 Py_ssize_t startinpos;
4575 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 const char *errmsg = "";
4579 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 unsigned int base64bits = 0;
4582 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004583 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 PyObject *errorHandler = NULL;
4585 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004587 if (size == 0) {
4588 if (consumed)
4589 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004590 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004591 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004593 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004594 _PyUnicodeWriter_Init(&writer);
4595 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596
4597 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 e = s + size;
4599
4600 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004601 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004603 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (inShift) { /* in a base-64 section */
4606 if (IS_BASE64(ch)) { /* consume a base-64 character */
4607 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4608 base64bits += 6;
4609 s++;
4610 if (base64bits >= 16) {
4611 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004612 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 base64bits -= 16;
4614 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004615 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 if (surrogate) {
4617 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004618 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4619 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004620 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004623 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 }
4625 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004626 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004627 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 }
4630 }
Victor Stinner551ac952011-11-29 22:58:13 +01004631 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 /* first surrogate */
4633 surrogate = outCh;
4634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004636 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004637 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 }
4639 }
4640 }
4641 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 if (base64bits > 0) { /* left-over bits */
4644 if (base64bits >= 6) {
4645 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004646 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 errmsg = "partial character in shift sequence";
4648 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 else {
4651 /* Some bits remain; they should be zero */
4652 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004653 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 errmsg = "non-zero padding bits in shift sequence";
4655 goto utf7Error;
4656 }
4657 }
4658 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004659 if (surrogate && DECODE_DIRECT(ch)) {
4660 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4661 goto onError;
4662 }
4663 surrogate = 0;
4664 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 /* '-' is absorbed; other terminating
4666 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004667 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
4670 }
4671 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 s++; /* consume '+' */
4674 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004676 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004677 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 }
4679 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004680 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004681 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004682 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004684 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004685 }
4686 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004689 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 else {
4693 startinpos = s-starts;
4694 s++;
4695 errmsg = "unexpected special character";
4696 goto utf7Error;
4697 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004698 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004701 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 errors, &errorHandler,
4703 "utf7", errmsg,
4704 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004705 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
4708
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709 /* end of string */
4710
4711 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4712 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004713 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 if (surrogate ||
4715 (base64bits >= 6) ||
4716 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 errors, &errorHandler,
4720 "utf7", "unterminated shift sequence",
4721 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004722 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004723 goto onError;
4724 if (s < e)
4725 goto restart;
4726 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004728
4729 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004730 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004732 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004733 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004734 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004735 writer.kind, writer.data, shiftOutStart);
4736 Py_XDECREF(errorHandler);
4737 Py_XDECREF(exc);
4738 _PyUnicodeWriter_Dealloc(&writer);
4739 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004740 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004741 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 }
4743 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004744 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004746 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 Py_XDECREF(errorHandler);
4749 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 Py_XDECREF(errorHandler);
4754 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004756 return NULL;
4757}
4758
4759
Alexander Belopolsky40018472011-02-26 01:02:56 +00004760PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761_PyUnicode_EncodeUTF7(PyObject *str,
4762 int base64SetO,
4763 int base64WhiteSpace,
4764 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 int kind;
4767 void *data;
4768 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004769 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004771 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 unsigned int base64bits = 0;
4773 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774 char * out;
4775 char * start;
4776
Benjamin Petersonbac79492012-01-14 13:34:47 -05004777 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004778 return NULL;
4779 kind = PyUnicode_KIND(str);
4780 data = PyUnicode_DATA(str);
4781 len = PyUnicode_GET_LENGTH(str);
4782
4783 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004786 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004787 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004788 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004789 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790 if (v == NULL)
4791 return NULL;
4792
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004794 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004795 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004796
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 if (inShift) {
4798 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4799 /* shifting out */
4800 if (base64bits) { /* output remaining bits */
4801 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4802 base64buffer = 0;
4803 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004804 }
4805 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 /* Characters not in the BASE64 set implicitly unshift the sequence
4807 so no '-' is required, except if the character is itself a '-' */
4808 if (IS_BASE64(ch) || ch == '-') {
4809 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004810 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004811 *out++ = (char) ch;
4812 }
4813 else {
4814 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004815 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004816 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 else { /* not in a shift sequence */
4818 if (ch == '+') {
4819 *out++ = '+';
4820 *out++ = '-';
4821 }
4822 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4823 *out++ = (char) ch;
4824 }
4825 else {
4826 *out++ = '+';
4827 inShift = 1;
4828 goto encode_char;
4829 }
4830 }
4831 continue;
4832encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004834 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004835
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836 /* code first surrogate */
4837 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004838 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004839 while (base64bits >= 6) {
4840 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4841 base64bits -= 6;
4842 }
4843 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004844 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004845 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846 base64bits += 16;
4847 base64buffer = (base64buffer << 16) | ch;
4848 while (base64bits >= 6) {
4849 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4850 base64bits -= 6;
4851 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004852 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004853 if (base64bits)
4854 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4855 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004856 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004857 if (_PyBytes_Resize(&v, out - start) < 0)
4858 return NULL;
4859 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004860}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004861PyObject *
4862PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4863 Py_ssize_t size,
4864 int base64SetO,
4865 int base64WhiteSpace,
4866 const char *errors)
4867{
4868 PyObject *result;
4869 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4870 if (tmp == NULL)
4871 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004872 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004873 base64WhiteSpace, errors);
4874 Py_DECREF(tmp);
4875 return result;
4876}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004877
Antoine Pitrou244651a2009-05-04 18:56:13 +00004878#undef IS_BASE64
4879#undef FROM_BASE64
4880#undef TO_BASE64
4881#undef DECODE_DIRECT
4882#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884/* --- UTF-8 Codec -------------------------------------------------------- */
4885
Alexander Belopolsky40018472011-02-26 01:02:56 +00004886PyObject *
4887PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004888 Py_ssize_t size,
4889 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890{
Walter Dörwald69652032004-09-07 20:24:22 +00004891 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4892}
4893
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894#include "stringlib/asciilib.h"
4895#include "stringlib/codecs.h"
4896#include "stringlib/undef.h"
4897
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004898#include "stringlib/ucs1lib.h"
4899#include "stringlib/codecs.h"
4900#include "stringlib/undef.h"
4901
4902#include "stringlib/ucs2lib.h"
4903#include "stringlib/codecs.h"
4904#include "stringlib/undef.h"
4905
4906#include "stringlib/ucs4lib.h"
4907#include "stringlib/codecs.h"
4908#include "stringlib/undef.h"
4909
Antoine Pitrouab868312009-01-10 15:40:25 +00004910/* Mask to quickly check whether a C 'long' contains a
4911 non-ASCII, UTF8-encoded char. */
4912#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004913# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004914#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004915# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004916#else
4917# error C 'long' size should be either 4 or 8!
4918#endif
4919
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920static Py_ssize_t
4921ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004924 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004926 /*
4927 * Issue #17237: m68k is a bit different from most architectures in
4928 * that objects do not use "natural alignment" - for example, int and
4929 * long are only aligned at 2-byte boundaries. Therefore the assert()
4930 * won't work; also, tests have shown that skipping the "optimised
4931 * version" will even speed up m68k.
4932 */
4933#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004935 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4936 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 /* Fast path, see in STRINGLIB(utf8_decode) for
4938 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004939 /* Help allocation */
4940 const char *_p = p;
4941 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942 while (_p < aligned_end) {
4943 unsigned long value = *(const unsigned long *) _p;
4944 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 *((unsigned long *)q) = value;
4947 _p += SIZEOF_LONG;
4948 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004949 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 p = _p;
4951 while (p < end) {
4952 if ((unsigned char)*p & 0x80)
4953 break;
4954 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004959#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 while (p < end) {
4961 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4962 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004963 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004964 /* Help allocation */
4965 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 while (_p < aligned_end) {
4967 unsigned long value = *(unsigned long *) _p;
4968 if (value & ASCII_CHAR_MASK)
4969 break;
4970 _p += SIZEOF_LONG;
4971 }
4972 p = _p;
4973 if (_p == end)
4974 break;
4975 }
4976 if ((unsigned char)*p & 0x80)
4977 break;
4978 ++p;
4979 }
4980 memcpy(dest, start, p - start);
4981 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982}
Antoine Pitrouab868312009-01-10 15:40:25 +00004983
Victor Stinner785938e2011-12-11 20:09:03 +01004984PyObject *
4985PyUnicode_DecodeUTF8Stateful(const char *s,
4986 Py_ssize_t size,
4987 const char *errors,
4988 Py_ssize_t *consumed)
4989{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004991 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004992 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993
4994 Py_ssize_t startinpos;
4995 Py_ssize_t endinpos;
4996 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004997 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004999 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01005000
5001 if (size == 0) {
5002 if (consumed)
5003 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005004 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005005 }
5006
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5008 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01005009 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 *consumed = 1;
5011 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005012 }
5013
Victor Stinner8f674cc2013-04-17 23:02:17 +02005014 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005015 writer.min_length = size;
5016 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005017 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01005018
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005019 writer.pos = ascii_decode(s, end, writer.data);
5020 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 while (s < end) {
5022 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005023 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005024
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005026 if (PyUnicode_IS_ASCII(writer.buffer))
5027 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005029 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005031 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032 } else {
5033 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005034 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005035 }
5036
5037 switch (ch) {
5038 case 0:
5039 if (s == end || consumed)
5040 goto End;
5041 errmsg = "unexpected end of data";
5042 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005043 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005044 break;
5045 case 1:
5046 errmsg = "invalid start byte";
5047 startinpos = s - starts;
5048 endinpos = startinpos + 1;
5049 break;
5050 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005051 case 3:
5052 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 errmsg = "invalid continuation byte";
5054 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005055 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005056 break;
5057 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005058 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 goto onError;
5060 continue;
5061 }
5062
Victor Stinner1d65d912015-10-05 13:43:50 +02005063 if (error_handler == _Py_ERROR_UNKNOWN)
5064 error_handler = get_error_handler(errors);
5065
5066 switch (error_handler) {
5067 case _Py_ERROR_IGNORE:
5068 s += (endinpos - startinpos);
5069 break;
5070
5071 case _Py_ERROR_REPLACE:
5072 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5073 goto onError;
5074 s += (endinpos - startinpos);
5075 break;
5076
5077 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005078 {
5079 Py_ssize_t i;
5080
Victor Stinner1d65d912015-10-05 13:43:50 +02005081 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5082 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005083 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005084 ch = (Py_UCS4)(unsigned char)(starts[i]);
5085 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5086 ch + 0xdc00);
5087 writer.pos++;
5088 }
5089 s += (endinpos - startinpos);
5090 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005091 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005092
5093 default:
5094 if (unicode_decode_call_errorhandler_writer(
5095 errors, &error_handler_obj,
5096 "utf-8", errmsg,
5097 &starts, &end, &startinpos, &endinpos, &exc, &s,
5098 &writer))
5099 goto onError;
5100 }
Victor Stinner785938e2011-12-11 20:09:03 +01005101 }
5102
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005104 if (consumed)
5105 *consumed = s - starts;
5106
Victor Stinner1d65d912015-10-05 13:43:50 +02005107 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005109 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005110
5111onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005112 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005114 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005115 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005116}
5117
Xavier de Gaye76febd02016-12-15 20:59:58 +01005118#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119
5120/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005121 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005122
5123 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005124 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125
5126wchar_t*
5127_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5128{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 wchar_t *unicode;
5131 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132
5133 /* Note: size will always be longer than the resulting Unicode
5134 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005135 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005137 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138 if (!unicode)
5139 return NULL;
5140
5141 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 if (ch > 0xFF) {
5152#if SIZEOF_WCHAR_T == 4
5153 assert(0);
5154#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005155 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156 /* compute and append the two surrogates: */
5157 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5158 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5159#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005160 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005161 else {
5162 if (!ch && s == e)
5163 break;
5164 /* surrogateescape */
5165 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5166 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005167 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005168 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005169 return unicode;
5170}
5171
Xavier de Gaye76febd02016-12-15 20:59:58 +01005172#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174/* Primary internal function which creates utf8 encoded bytes objects.
5175
5176 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005177 and allocate exactly as much space needed at the end. Else allocate the
5178 maximum possible needed (4 result bytes per Unicode character), and return
5179 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005180*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005181PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005182_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183{
Victor Stinner6099a032011-12-18 14:22:26 +01005184 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005185 void *data;
5186 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005188 if (!PyUnicode_Check(unicode)) {
5189 PyErr_BadArgument();
5190 return NULL;
5191 }
5192
5193 if (PyUnicode_READY(unicode) == -1)
5194 return NULL;
5195
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005196 if (PyUnicode_UTF8(unicode))
5197 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5198 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199
5200 kind = PyUnicode_KIND(unicode);
5201 data = PyUnicode_DATA(unicode);
5202 size = PyUnicode_GET_LENGTH(unicode);
5203
Benjamin Petersonead6b532011-12-20 17:23:42 -06005204 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005205 default:
5206 assert(0);
5207 case PyUnicode_1BYTE_KIND:
5208 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5209 assert(!PyUnicode_IS_ASCII(unicode));
5210 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5211 case PyUnicode_2BYTE_KIND:
5212 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5213 case PyUnicode_4BYTE_KIND:
5214 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216}
5217
Alexander Belopolsky40018472011-02-26 01:02:56 +00005218PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005219PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5220 Py_ssize_t size,
5221 const char *errors)
5222{
5223 PyObject *v, *unicode;
5224
5225 unicode = PyUnicode_FromUnicode(s, size);
5226 if (unicode == NULL)
5227 return NULL;
5228 v = _PyUnicode_AsUTF8String(unicode, errors);
5229 Py_DECREF(unicode);
5230 return v;
5231}
5232
5233PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005234PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005236 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237}
5238
Walter Dörwald41980ca2007-08-16 21:55:45 +00005239/* --- UTF-32 Codec ------------------------------------------------------- */
5240
5241PyObject *
5242PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 Py_ssize_t size,
5244 const char *errors,
5245 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246{
5247 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5248}
5249
5250PyObject *
5251PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 Py_ssize_t size,
5253 const char *errors,
5254 int *byteorder,
5255 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256{
5257 const char *starts = s;
5258 Py_ssize_t startinpos;
5259 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005260 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005261 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005262 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005263 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 PyObject *errorHandler = NULL;
5266 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005267
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268 q = (unsigned char *)s;
5269 e = q + size;
5270
5271 if (byteorder)
5272 bo = *byteorder;
5273
5274 /* Check for BOM marks (U+FEFF) in the input and adjust current
5275 byte order setting accordingly. In native mode, the leading BOM
5276 mark is skipped, in all other modes, it is copied to the output
5277 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005279 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005280 if (bom == 0x0000FEFF) {
5281 bo = -1;
5282 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005284 else if (bom == 0xFFFE0000) {
5285 bo = 1;
5286 q += 4;
5287 }
5288 if (byteorder)
5289 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290 }
5291
Victor Stinnere64322e2012-10-30 23:12:47 +01005292 if (q == e) {
5293 if (consumed)
5294 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005295 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005296 }
5297
Victor Stinnere64322e2012-10-30 23:12:47 +01005298#ifdef WORDS_BIGENDIAN
5299 le = bo < 0;
5300#else
5301 le = bo <= 0;
5302#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005303 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005304
Victor Stinner8f674cc2013-04-17 23:02:17 +02005305 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005306 writer.min_length = (e - q + 3) / 4;
5307 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005309
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 while (1) {
5311 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005313
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005315 enum PyUnicode_Kind kind = writer.kind;
5316 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005317 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005319 if (le) {
5320 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005321 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005322 if (ch > maxch)
5323 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005324 if (kind != PyUnicode_1BYTE_KIND &&
5325 Py_UNICODE_IS_SURROGATE(ch))
5326 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005327 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 q += 4;
5329 } while (q <= last);
5330 }
5331 else {
5332 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005333 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005334 if (ch > maxch)
5335 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005336 if (kind != PyUnicode_1BYTE_KIND &&
5337 Py_UNICODE_IS_SURROGATE(ch))
5338 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005339 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 q += 4;
5341 } while (q <= last);
5342 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005343 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 }
5345
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005346 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005347 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005348 startinpos = ((const char *)q) - starts;
5349 endinpos = startinpos + 4;
5350 }
5351 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005352 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005354 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005356 startinpos = ((const char *)q) - starts;
5357 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005359 else {
5360 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005361 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 goto onError;
5363 q += 4;
5364 continue;
5365 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005366 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005367 startinpos = ((const char *)q) - starts;
5368 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005370
5371 /* The remaining input chars are ignored if the callback
5372 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005375 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379 }
5380
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384 Py_XDECREF(errorHandler);
5385 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005386 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005389 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005390 Py_XDECREF(errorHandler);
5391 Py_XDECREF(exc);
5392 return NULL;
5393}
5394
5395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396_PyUnicode_EncodeUTF32(PyObject *str,
5397 const char *errors,
5398 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005400 enum PyUnicode_Kind kind;
5401 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005403 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005404 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005405#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005411 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005412 PyObject *errorHandler = NULL;
5413 PyObject *exc = NULL;
5414 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005416 if (!PyUnicode_Check(str)) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005420 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005421 return NULL;
5422 kind = PyUnicode_KIND(str);
5423 data = PyUnicode_DATA(str);
5424 len = PyUnicode_GET_LENGTH(str);
5425
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005427 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005429 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005430 if (v == NULL)
5431 return NULL;
5432
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 /* output buffer is 4-bytes aligned */
5434 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005435 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005436 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005438 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005439 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005440
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005443 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005445 else
5446 encoding = "utf-32";
5447
5448 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5450 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005451 }
5452
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005453 pos = 0;
5454 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005455 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005456
5457 if (kind == PyUnicode_2BYTE_KIND) {
5458 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5459 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 else {
5462 assert(kind == PyUnicode_4BYTE_KIND);
5463 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5464 &out, native_ordering);
5465 }
5466 if (pos == len)
5467 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005468
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005469 rep = unicode_encode_call_errorhandler(
5470 errors, &errorHandler,
5471 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005472 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 if (!rep)
5474 goto error;
5475
5476 if (PyBytes_Check(rep)) {
5477 repsize = PyBytes_GET_SIZE(rep);
5478 if (repsize & 3) {
5479 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 "surrogates not allowed");
5482 goto error;
5483 }
5484 moreunits = repsize / 4;
5485 }
5486 else {
5487 assert(PyUnicode_Check(rep));
5488 if (PyUnicode_READY(rep) < 0)
5489 goto error;
5490 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5491 if (!PyUnicode_IS_ASCII(rep)) {
5492 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005493 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 "surrogates not allowed");
5495 goto error;
5496 }
5497 }
5498
5499 /* four bytes are reserved for each surrogate */
5500 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005501 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005502 Py_ssize_t morebytes = 4 * (moreunits - 1);
5503 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5504 /* integer overflow */
5505 PyErr_NoMemory();
5506 goto error;
5507 }
5508 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5509 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005510 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005511 }
5512
5513 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005514 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005515 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005516 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005517 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005518 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5519 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005520 }
5521
5522 Py_CLEAR(rep);
5523 }
5524
5525 /* Cut back to size actually needed. This is necessary for, for example,
5526 encoding of a string containing isolated surrogates and the 'ignore'
5527 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005528 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005529 if (nsize != PyBytes_GET_SIZE(v))
5530 _PyBytes_Resize(&v, nsize);
5531 Py_XDECREF(errorHandler);
5532 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005533 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005534 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005535 error:
5536 Py_XDECREF(rep);
5537 Py_XDECREF(errorHandler);
5538 Py_XDECREF(exc);
5539 Py_XDECREF(v);
5540 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005541}
5542
Alexander Belopolsky40018472011-02-26 01:02:56 +00005543PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005544PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5545 Py_ssize_t size,
5546 const char *errors,
5547 int byteorder)
5548{
5549 PyObject *result;
5550 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5551 if (tmp == NULL)
5552 return NULL;
5553 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5554 Py_DECREF(tmp);
5555 return result;
5556}
5557
5558PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005559PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005560{
Victor Stinnerb960b342011-11-20 19:12:52 +01005561 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005562}
5563
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564/* --- UTF-16 Codec ------------------------------------------------------- */
5565
Tim Peters772747b2001-08-09 22:21:55 +00005566PyObject *
5567PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 Py_ssize_t size,
5569 const char *errors,
5570 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571{
Walter Dörwald69652032004-09-07 20:24:22 +00005572 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5573}
5574
5575PyObject *
5576PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 Py_ssize_t size,
5578 const char *errors,
5579 int *byteorder,
5580 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005583 Py_ssize_t startinpos;
5584 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005585 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005587 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005588 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005589 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 PyObject *errorHandler = NULL;
5591 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593
Tim Peters772747b2001-08-09 22:21:55 +00005594 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
5597 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005598 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005600 /* Check for BOM marks (U+FEFF) in the input and adjust current
5601 byte order setting accordingly. In native mode, the leading BOM
5602 mark is skipped, in all other modes, it is copied to the output
5603 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 if (bo == 0 && size >= 2) {
5605 const Py_UCS4 bom = (q[1] << 8) | q[0];
5606 if (bom == 0xFEFF) {
5607 q += 2;
5608 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005610 else if (bom == 0xFFFE) {
5611 q += 2;
5612 bo = 1;
5613 }
5614 if (byteorder)
5615 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 if (q == e) {
5619 if (consumed)
5620 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005621 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005622 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623
Christian Heimes743e0cd2012-10-17 23:52:17 +02005624#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005626 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005627#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005629 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005630#endif
Tim Peters772747b2001-08-09 22:21:55 +00005631
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 /* Note: size will always be longer than the resulting Unicode
5633 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005634 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005635 writer.min_length = (e - q + 1) / 2;
5636 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 while (1) {
5640 Py_UCS4 ch = 0;
5641 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005644 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005645 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005646 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 native_ordering);
5648 else
5649 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005650 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 native_ordering);
5652 } else if (kind == PyUnicode_2BYTE_KIND) {
5653 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005654 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005655 native_ordering);
5656 } else {
5657 assert(kind == PyUnicode_4BYTE_KIND);
5658 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005659 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005660 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005661 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663
Antoine Pitrou63065d72012-05-15 23:48:04 +02005664 switch (ch)
5665 {
5666 case 0:
5667 /* remaining byte at the end? (size should be even) */
5668 if (q == e || consumed)
5669 goto End;
5670 errmsg = "truncated data";
5671 startinpos = ((const char *)q) - starts;
5672 endinpos = ((const char *)e) - starts;
5673 break;
5674 /* The remaining input chars are ignored if the callback
5675 chooses to skip the input */
5676 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005677 q -= 2;
5678 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005679 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005680 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005681 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005682 endinpos = ((const char *)e) - starts;
5683 break;
5684 case 2:
5685 errmsg = "illegal encoding";
5686 startinpos = ((const char *)q) - 2 - starts;
5687 endinpos = startinpos + 2;
5688 break;
5689 case 3:
5690 errmsg = "illegal UTF-16 surrogate";
5691 startinpos = ((const char *)q) - 4 - starts;
5692 endinpos = startinpos + 2;
5693 break;
5694 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005695 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005696 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 continue;
5698 }
5699
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005701 errors,
5702 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005703 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005704 &starts,
5705 (const char **)&e,
5706 &startinpos,
5707 &endinpos,
5708 &exc,
5709 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
5713
Antoine Pitrou63065d72012-05-15 23:48:04 +02005714End:
Walter Dörwald69652032004-09-07 20:24:22 +00005715 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 Py_XDECREF(errorHandler);
5719 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005723 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 Py_XDECREF(errorHandler);
5725 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 return NULL;
5727}
5728
Tim Peters772747b2001-08-09 22:21:55 +00005729PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730_PyUnicode_EncodeUTF16(PyObject *str,
5731 const char *errors,
5732 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005734 enum PyUnicode_Kind kind;
5735 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005737 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005739 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005740#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005741 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005742#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005743 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005744#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005745 const char *encoding;
5746 Py_ssize_t nsize, pos;
5747 PyObject *errorHandler = NULL;
5748 PyObject *exc = NULL;
5749 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005750
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751 if (!PyUnicode_Check(str)) {
5752 PyErr_BadArgument();
5753 return NULL;
5754 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005755 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 return NULL;
5757 kind = PyUnicode_KIND(str);
5758 data = PyUnicode_DATA(str);
5759 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005760
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005762 if (kind == PyUnicode_4BYTE_KIND) {
5763 const Py_UCS4 *in = (const Py_UCS4 *)data;
5764 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 while (in < end) {
5766 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005767 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
5769 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005770 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005773 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 nsize = len + pairs + (byteorder == 0);
5775 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005780 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005781 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005782 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005783 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005784 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005785 }
5786 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005787 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005788 }
Tim Peters772747b2001-08-09 22:21:55 +00005789
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 if (kind == PyUnicode_1BYTE_KIND) {
5791 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5792 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005793 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005794
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005795 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005797 }
5798 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005799 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005800 }
5801 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005802 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005803 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005804
5805 pos = 0;
5806 while (pos < len) {
5807 Py_ssize_t repsize, moreunits;
5808
5809 if (kind == PyUnicode_2BYTE_KIND) {
5810 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5811 &out, native_ordering);
5812 }
5813 else {
5814 assert(kind == PyUnicode_4BYTE_KIND);
5815 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5816 &out, native_ordering);
5817 }
5818 if (pos == len)
5819 break;
5820
5821 rep = unicode_encode_call_errorhandler(
5822 errors, &errorHandler,
5823 encoding, "surrogates not allowed",
5824 str, &exc, pos, pos + 1, &pos);
5825 if (!rep)
5826 goto error;
5827
5828 if (PyBytes_Check(rep)) {
5829 repsize = PyBytes_GET_SIZE(rep);
5830 if (repsize & 1) {
5831 raise_encode_exception(&exc, encoding,
5832 str, pos - 1, pos,
5833 "surrogates not allowed");
5834 goto error;
5835 }
5836 moreunits = repsize / 2;
5837 }
5838 else {
5839 assert(PyUnicode_Check(rep));
5840 if (PyUnicode_READY(rep) < 0)
5841 goto error;
5842 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5843 if (!PyUnicode_IS_ASCII(rep)) {
5844 raise_encode_exception(&exc, encoding,
5845 str, pos - 1, pos,
5846 "surrogates not allowed");
5847 goto error;
5848 }
5849 }
5850
5851 /* two bytes are reserved for each surrogate */
5852 if (moreunits > 1) {
5853 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5854 Py_ssize_t morebytes = 2 * (moreunits - 1);
5855 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5856 /* integer overflow */
5857 PyErr_NoMemory();
5858 goto error;
5859 }
5860 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5861 goto error;
5862 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5863 }
5864
5865 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005866 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005867 out += moreunits;
5868 } else /* rep is unicode */ {
5869 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5870 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5871 &out, native_ordering);
5872 }
5873
5874 Py_CLEAR(rep);
5875 }
5876
5877 /* Cut back to size actually needed. This is necessary for, for example,
5878 encoding of a string containing isolated surrogates and the 'ignore' handler
5879 is used. */
5880 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5881 if (nsize != PyBytes_GET_SIZE(v))
5882 _PyBytes_Resize(&v, nsize);
5883 Py_XDECREF(errorHandler);
5884 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005885 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005886 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005887 error:
5888 Py_XDECREF(rep);
5889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
5891 Py_XDECREF(v);
5892 return NULL;
5893#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894}
5895
Alexander Belopolsky40018472011-02-26 01:02:56 +00005896PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5898 Py_ssize_t size,
5899 const char *errors,
5900 int byteorder)
5901{
5902 PyObject *result;
5903 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5904 if (tmp == NULL)
5905 return NULL;
5906 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5907 Py_DECREF(tmp);
5908 return result;
5909}
5910
5911PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005912PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915}
5916
5917/* --- Unicode Escape Codec ----------------------------------------------- */
5918
Fredrik Lundh06d12682001-01-24 07:59:11 +00005919static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005920
Alexander Belopolsky40018472011-02-26 01:02:56 +00005921PyObject *
Eric V. Smith56466482016-10-31 14:46:26 -04005922_PyUnicode_DecodeUnicodeEscape(const char *s,
5923 Py_ssize_t size,
5924 const char *errors,
5925 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005928 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 PyObject *errorHandler = NULL;
5931 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005932
Eric V. Smith56466482016-10-31 14:46:26 -04005933 // so we can remember if we've seen an invalid escape char or not
5934 *first_invalid_escape = NULL;
5935
Victor Stinner62ec3312016-09-06 17:04:34 -07005936 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005937 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005938 }
5939 /* Escaped strings will always be longer than the resulting
5940 Unicode string, so we start with size here and then reduce the
5941 length after conversion to the true value.
5942 (but if the error callback returns a long replacement string
5943 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005944 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005945 writer.min_length = size;
5946 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5947 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948 }
5949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 end = s + size;
5951 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005952 unsigned char c = (unsigned char) *s++;
5953 Py_UCS4 ch;
5954 int count;
5955 Py_ssize_t startinpos;
5956 Py_ssize_t endinpos;
5957 const char *message;
5958
5959#define WRITE_ASCII_CHAR(ch) \
5960 do { \
5961 assert(ch <= 127); \
5962 assert(writer.pos < writer.size); \
5963 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5964 } while(0)
5965
5966#define WRITE_CHAR(ch) \
5967 do { \
5968 if (ch <= writer.maxchar) { \
5969 assert(writer.pos < writer.size); \
5970 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5971 } \
5972 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5973 goto onError; \
5974 } \
5975 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 if (c != '\\') {
5979 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 continue;
5981 }
5982
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 if (s >= end) {
5986 message = "\\ at end of string";
5987 goto error;
5988 }
5989 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005992 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 case '\n': continue;
5996 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5997 case '\'': WRITE_ASCII_CHAR('\''); continue;
5998 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5999 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006000 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6002 case 't': WRITE_ASCII_CHAR('\t'); continue;
6003 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6004 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006005 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006007 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006008 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 case '0': case '1': case '2': case '3':
6012 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006013 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006014 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006015 ch = (ch<<3) + *s++ - '0';
6016 if (s < end && '0' <= *s && *s <= '7') {
6017 ch = (ch<<3) + *s++ - '0';
6018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 WRITE_CHAR(ch);
6021 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 /* hex escapes */
6024 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006027 message = "truncated \\xXX escape";
6028 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006032 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006033 message = "truncated \\uXXXX escape";
6034 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006037 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006038 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006039 message = "truncated \\UXXXXXXXX escape";
6040 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006041 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006042 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006043 ch <<= 4;
6044 if (c >= '0' && c <= '9') {
6045 ch += c - '0';
6046 }
6047 else if (c >= 'a' && c <= 'f') {
6048 ch += c - ('a' - 10);
6049 }
6050 else if (c >= 'A' && c <= 'F') {
6051 ch += c - ('A' - 10);
6052 }
6053 else {
6054 break;
6055 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006056 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006057 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006058 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 }
6060
6061 /* when we get here, ch is a 32-bit unicode character */
6062 if (ch > MAX_UNICODE) {
6063 message = "illegal Unicode character";
6064 goto error;
6065 }
6066
6067 WRITE_CHAR(ch);
6068 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 if (ucnhash_CAPI == NULL) {
6073 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006074 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6075 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 if (ucnhash_CAPI == NULL) {
6077 PyErr_SetString(
6078 PyExc_UnicodeError,
6079 "\\N escapes not supported (can't load unicodedata module)"
6080 );
6081 goto onError;
6082 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006084
6085 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006086 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006087 const char *start = ++s;
6088 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006089 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006090 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 namelen = s - start;
6093 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006095 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 ch = 0xffffffff; /* in case 'getcode' messes up */
6097 if (namelen <= INT_MAX &&
6098 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6099 &ch, 0)) {
6100 assert(ch <= MAX_UNICODE);
6101 WRITE_CHAR(ch);
6102 continue;
6103 }
6104 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006105 }
6106 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006107 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006108
6109 default:
Eric V. Smith56466482016-10-31 14:46:26 -04006110 if (*first_invalid_escape == NULL) {
6111 *first_invalid_escape = s-1; /* Back up one char, since we've
6112 already incremented s. */
6113 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006114 WRITE_ASCII_CHAR('\\');
6115 WRITE_CHAR(c);
6116 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006118
6119 error:
6120 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006121 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006122 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006123 errors, &errorHandler,
6124 "unicodeescape", message,
6125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006126 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006127 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006128 }
6129 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6130 goto onError;
6131 }
6132
6133#undef WRITE_ASCII_CHAR
6134#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006136
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006137 Py_XDECREF(errorHandler);
6138 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006139 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006140
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006142 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 Py_XDECREF(errorHandler);
6144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 return NULL;
6146}
6147
Eric V. Smith56466482016-10-31 14:46:26 -04006148PyObject *
6149PyUnicode_DecodeUnicodeEscape(const char *s,
6150 Py_ssize_t size,
6151 const char *errors)
6152{
6153 const char *first_invalid_escape;
6154 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6155 &first_invalid_escape);
6156 if (result == NULL)
6157 return NULL;
6158 if (first_invalid_escape != NULL) {
6159 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6160 "invalid escape sequence '\\%c'",
6161 *first_invalid_escape) < 0) {
6162 Py_DECREF(result);
6163 return NULL;
6164 }
6165 }
6166 return result;
6167}
6168
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006169/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170
Alexander Belopolsky40018472011-02-26 01:02:56 +00006171PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Ezio Melottie7f90372012-10-05 03:33:31 +03006181 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006182 escape.
6183
Ezio Melottie7f90372012-10-05 03:33:31 +03006184 For UCS1 strings it's '\xxx', 4 bytes per source character.
6185 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6186 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006187 */
6188
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 if (!PyUnicode_Check(unicode)) {
6190 PyErr_BadArgument();
6191 return NULL;
6192 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 }
Victor Stinner358af132015-10-12 22:36:57 +02006196
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006197 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 if (len == 0) {
6199 return PyBytes_FromStringAndSize(NULL, 0);
6200 }
6201
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202 kind = PyUnicode_KIND(unicode);
6203 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006204 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6205 bytes, and 1 byte characters 4. */
6206 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006207 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006208 return PyErr_NoMemory();
6209 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006210 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 if (repr == NULL) {
6212 return NULL;
6213 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006214
Victor Stinner62ec3312016-09-06 17:04:34 -07006215 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006217 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006218
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 /* U+0000-U+00ff range */
6220 if (ch < 0x100) {
6221 if (ch >= ' ' && ch < 127) {
6222 if (ch != '\\') {
6223 /* Copy printable US ASCII as-is */
6224 *p++ = (char) ch;
6225 }
6226 /* Escape backslashes */
6227 else {
6228 *p++ = '\\';
6229 *p++ = '\\';
6230 }
6231 }
Victor Stinner358af132015-10-12 22:36:57 +02006232
Victor Stinner62ec3312016-09-06 17:04:34 -07006233 /* Map special whitespace to '\t', \n', '\r' */
6234 else if (ch == '\t') {
6235 *p++ = '\\';
6236 *p++ = 't';
6237 }
6238 else if (ch == '\n') {
6239 *p++ = '\\';
6240 *p++ = 'n';
6241 }
6242 else if (ch == '\r') {
6243 *p++ = '\\';
6244 *p++ = 'r';
6245 }
6246
6247 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6248 else {
6249 *p++ = '\\';
6250 *p++ = 'x';
6251 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6252 *p++ = Py_hexdigits[ch & 0x000F];
6253 }
Tim Petersced69f82003-09-16 20:30:58 +00006254 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006255 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 *p++ = '\\';
6258 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006259 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6260 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6261 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6262 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6265 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006266
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 /* Make sure that the first two digits are zero */
6268 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006269 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 *p++ = 'U';
6271 *p++ = '0';
6272 *p++ = '0';
6273 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6274 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6275 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6276 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6277 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6278 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 assert(p - PyBytes_AS_STRING(repr) > 0);
6283 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6284 return NULL;
6285 }
6286 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287}
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006290PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6291 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006293 PyObject *result;
6294 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 }
6298
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006299 result = PyUnicode_AsUnicodeEscapeString(tmp);
6300 Py_DECREF(tmp);
6301 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302}
6303
6304/* --- Raw Unicode Escape Codec ------------------------------------------- */
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
6307PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006308 Py_ssize_t size,
6309 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006312 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314 PyObject *errorHandler = NULL;
6315 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006316
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006318 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006320
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 /* Escaped strings will always be longer than the resulting
6322 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 length after conversion to the true value. (But decoding error
6324 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006325 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 writer.min_length = size;
6327 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6328 goto onError;
6329 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006330
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 end = s + size;
6332 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 unsigned char c = (unsigned char) *s++;
6334 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006335 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 Py_ssize_t startinpos;
6337 Py_ssize_t endinpos;
6338 const char *message;
6339
6340#define WRITE_CHAR(ch) \
6341 do { \
6342 if (ch <= writer.maxchar) { \
6343 assert(writer.pos < writer.size); \
6344 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6345 } \
6346 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6347 goto onError; \
6348 } \
6349 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 if (c != '\\' || s >= end) {
6353 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006356
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 c = (unsigned char) *s++;
6358 if (c == 'u') {
6359 count = 4;
6360 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006362 else if (c == 'U') {
6363 count = 8;
6364 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006365 }
6366 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006367 assert(writer.pos < writer.size);
6368 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6369 WRITE_CHAR(c);
6370 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006371 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 startinpos = s - starts - 2;
6373
6374 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6375 for (ch = 0; count && s < end; ++s, --count) {
6376 c = (unsigned char)*s;
6377 ch <<= 4;
6378 if (c >= '0' && c <= '9') {
6379 ch += c - '0';
6380 }
6381 else if (c >= 'a' && c <= 'f') {
6382 ch += c - ('a' - 10);
6383 }
6384 else if (c >= 'A' && c <= 'F') {
6385 ch += c - ('A' - 10);
6386 }
6387 else {
6388 break;
6389 }
6390 }
6391 if (!count) {
6392 if (ch <= MAX_UNICODE) {
6393 WRITE_CHAR(ch);
6394 continue;
6395 }
6396 message = "\\Uxxxxxxxx out of range";
6397 }
6398
6399 endinpos = s-starts;
6400 writer.min_length = end - s + writer.pos;
6401 if (unicode_decode_call_errorhandler_writer(
6402 errors, &errorHandler,
6403 "rawunicodeescape", message,
6404 &starts, &end, &startinpos, &endinpos, &exc, &s,
6405 &writer)) {
6406 goto onError;
6407 }
6408 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6409 goto onError;
6410 }
6411
6412#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 Py_XDECREF(errorHandler);
6415 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006416 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006417
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006419 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 Py_XDECREF(errorHandler);
6421 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006423
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424}
6425
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006433 int kind;
6434 void *data;
6435 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006437 if (!PyUnicode_Check(unicode)) {
6438 PyErr_BadArgument();
6439 return NULL;
6440 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006442 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444 kind = PyUnicode_KIND(unicode);
6445 data = PyUnicode_DATA(unicode);
6446 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 if (kind == PyUnicode_1BYTE_KIND) {
6448 return PyBytes_FromStringAndSize(data, len);
6449 }
Victor Stinner0e368262011-11-10 20:12:49 +01006450
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6452 bytes, and 1 byte characters 4. */
6453 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006454
Victor Stinner62ec3312016-09-06 17:04:34 -07006455 if (len > PY_SSIZE_T_MAX / expandsize) {
6456 return PyErr_NoMemory();
6457 }
6458 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6459 if (repr == NULL) {
6460 return NULL;
6461 }
6462 if (len == 0) {
6463 return repr;
6464 }
6465
6466 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006467 for (pos = 0; pos < len; pos++) {
6468 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006469
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6471 if (ch < 0x100) {
6472 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006473 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6475 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 *p++ = '\\';
6477 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006478 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6479 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6480 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6481 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006483 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6484 else {
6485 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6486 *p++ = '\\';
6487 *p++ = 'U';
6488 *p++ = '0';
6489 *p++ = '0';
6490 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6491 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6492 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6493 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6494 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6495 *p++ = Py_hexdigits[ch & 15];
6496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006498
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 assert(p > PyBytes_AS_STRING(repr));
6500 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6501 return NULL;
6502 }
6503 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
Alexander Belopolsky40018472011-02-26 01:02:56 +00006506PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006507PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6508 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006510 PyObject *result;
6511 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6512 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006513 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006514 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6515 Py_DECREF(tmp);
6516 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517}
6518
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006519/* --- Unicode Internal Codec ------------------------------------------- */
6520
Alexander Belopolsky40018472011-02-26 01:02:56 +00006521PyObject *
6522_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006523 Py_ssize_t size,
6524 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006525{
6526 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527 Py_ssize_t startinpos;
6528 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006530 const char *end;
6531 const char *reason;
6532 PyObject *errorHandler = NULL;
6533 PyObject *exc = NULL;
6534
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006535 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006536 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006537 1))
6538 return NULL;
6539
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006540 if (size == 0)
6541 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006542
Victor Stinner8f674cc2013-04-17 23:02:17 +02006543 _PyUnicodeWriter_Init(&writer);
6544 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6545 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006547 }
6548 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006549
Victor Stinner8f674cc2013-04-17 23:02:17 +02006550 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006552 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006553 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006554 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006555 endinpos = end-starts;
6556 reason = "truncated input";
6557 goto error;
6558 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006559 /* We copy the raw representation one byte at a time because the
6560 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006561 ((char *) &uch)[0] = s[0];
6562 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006563#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006564 ((char *) &uch)[2] = s[2];
6565 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006566#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006567 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006568#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006569 /* We have to sanity check the raw data, otherwise doom looms for
6570 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006571 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006572 endinpos = s - starts + Py_UNICODE_SIZE;
6573 reason = "illegal code point (> 0x10FFFF)";
6574 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006576#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006577 s += Py_UNICODE_SIZE;
6578#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006579 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006580 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006581 Py_UNICODE uch2;
6582 ((char *) &uch2)[0] = s[0];
6583 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006584 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006585 {
Victor Stinner551ac952011-11-29 22:58:13 +01006586 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006587 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006588 }
6589 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006590#endif
6591
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006592 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006593 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006594 continue;
6595
6596 error:
6597 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006598 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006599 errors, &errorHandler,
6600 "unicode_internal", reason,
6601 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006602 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006603 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006604 }
6605
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006606 Py_XDECREF(errorHandler);
6607 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006608 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006609
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006611 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006612 Py_XDECREF(errorHandler);
6613 Py_XDECREF(exc);
6614 return NULL;
6615}
6616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617/* --- Latin-1 Codec ------------------------------------------------------ */
6618
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619PyObject *
6620PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006621 Py_ssize_t size,
6622 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006625 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626}
6627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629static void
6630make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006631 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006632 PyObject *unicode,
6633 Py_ssize_t startpos, Py_ssize_t endpos,
6634 const char *reason)
6635{
6636 if (*exceptionObject == NULL) {
6637 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006639 encoding, unicode, startpos, endpos, reason);
6640 }
6641 else {
6642 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6643 goto onError;
6644 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6645 goto onError;
6646 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6647 goto onError;
6648 return;
6649 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006650 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006651 }
6652}
6653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655static void
6656raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006658 PyObject *unicode,
6659 Py_ssize_t startpos, Py_ssize_t endpos,
6660 const char *reason)
6661{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006662 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006663 encoding, unicode, startpos, endpos, reason);
6664 if (*exceptionObject != NULL)
6665 PyCodec_StrictErrors(*exceptionObject);
6666}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667
6668/* error handling callback helper:
6669 build arguments, call the callback and check the arguments,
6670 put the result into newpos and return the replacement string, which
6671 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006672static PyObject *
6673unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006674 PyObject **errorHandler,
6675 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006677 Py_ssize_t startpos, Py_ssize_t endpos,
6678 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006680 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 PyObject *restuple;
6683 PyObject *resunicode;
6684
6685 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 }
6690
Benjamin Petersonbac79492012-01-14 13:34:47 -05006691 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006692 return NULL;
6693 len = PyUnicode_GET_LENGTH(unicode);
6694
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006695 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006696 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699
6700 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006705 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 Py_DECREF(restuple);
6707 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006709 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 &resunicode, newpos)) {
6711 Py_DECREF(restuple);
6712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006714 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6715 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6716 Py_DECREF(restuple);
6717 return NULL;
6718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 *newpos = len + *newpos;
6721 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006722 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 Py_DECREF(restuple);
6724 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006725 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 Py_INCREF(resunicode);
6727 Py_DECREF(restuple);
6728 return resunicode;
6729}
6730
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006733 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006734 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006736 /* input state */
6737 Py_ssize_t pos=0, size;
6738 int kind;
6739 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 /* pointer into the output */
6741 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006742 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6743 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006744 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006746 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006747 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006748 /* output object */
6749 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750
Benjamin Petersonbac79492012-01-14 13:34:47 -05006751 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 return NULL;
6753 size = PyUnicode_GET_LENGTH(unicode);
6754 kind = PyUnicode_KIND(unicode);
6755 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756 /* allocate enough for a simple encoding without
6757 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006758 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006759 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006760
6761 _PyBytesWriter_Init(&writer);
6762 str = _PyBytesWriter_Alloc(&writer, size);
6763 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006764 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006767 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006770 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006772 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006776 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006779 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006781
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006782 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006784
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006785 /* Only overallocate the buffer if it's not the last write */
6786 writer.overallocate = (collend < size);
6787
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006789 if (error_handler == _Py_ERROR_UNKNOWN)
6790 error_handler = get_error_handler(errors);
6791
6792 switch (error_handler) {
6793 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006794 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006796
6797 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006798 memset(str, '?', collend - collstart);
6799 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006800 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006801 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006802 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 break;
Victor Stinner50149202015-09-22 00:26:54 +02006804
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006806 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006807 writer.min_size -= (collend - collstart);
6808 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006809 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006810 if (str == NULL)
6811 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006812 pos = collend;
6813 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006814
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006815 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006816 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006817 writer.min_size -= (collend - collstart);
6818 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006819 unicode, collstart, collend);
6820 if (str == NULL)
6821 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006822 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 break;
Victor Stinner50149202015-09-22 00:26:54 +02006824
Victor Stinnerc3713e92015-09-29 12:32:13 +02006825 case _Py_ERROR_SURROGATEESCAPE:
6826 for (i = collstart; i < collend; ++i) {
6827 ch = PyUnicode_READ(kind, data, i);
6828 if (ch < 0xdc80 || 0xdcff < ch) {
6829 /* Not a UTF-8b surrogate */
6830 break;
6831 }
6832 *str++ = (char)(ch - 0xdc00);
6833 ++pos;
6834 }
6835 if (i >= collend)
6836 break;
6837 collstart = pos;
6838 assert(collstart != collend);
6839 /* fallback to general error handling */
6840
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6843 encoding, reason, unicode, &exc,
6844 collstart, collend, &newpos);
6845 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006847
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006848 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006849 writer.min_size -= 1;
6850
Victor Stinner6bd525b2015-10-09 13:10:05 +02006851 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006852 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006853 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006854 PyBytes_AS_STRING(rep),
6855 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006856 if (str == NULL)
6857 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006858 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006859 else {
6860 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006861
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006864
6865 if (PyUnicode_IS_ASCII(rep)) {
6866 /* Fast path: all characters are smaller than limit */
6867 assert(limit >= 128);
6868 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6869 str = _PyBytesWriter_WriteBytes(&writer, str,
6870 PyUnicode_DATA(rep),
6871 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006873 else {
6874 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6875
6876 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6877 if (str == NULL)
6878 goto onError;
6879
6880 /* check if there is anything unencodable in the
6881 replacement and copy it to the output */
6882 for (i = 0; repsize-->0; ++i, ++str) {
6883 ch = PyUnicode_READ_CHAR(rep, i);
6884 if (ch >= limit) {
6885 raise_encode_exception(&exc, encoding, unicode,
6886 pos, pos+1, reason);
6887 goto onError;
6888 }
6889 *str = (char)ch;
6890 }
6891 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006894 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006895 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006896
6897 /* If overallocation was disabled, ensure that it was the last
6898 write. Otherwise, we missed an optimization */
6899 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006900 }
6901 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006902
Victor Stinner50149202015-09-22 00:26:54 +02006903 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006905 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006906
6907 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006908 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006909 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006910 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006911 Py_XDECREF(exc);
6912 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913}
6914
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006915/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006916PyObject *
6917PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006918 Py_ssize_t size,
6919 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006921 PyObject *result;
6922 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6923 if (unicode == NULL)
6924 return NULL;
6925 result = unicode_encode_ucs1(unicode, errors, 256);
6926 Py_DECREF(unicode);
6927 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006931_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932{
6933 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 PyErr_BadArgument();
6935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006937 if (PyUnicode_READY(unicode) == -1)
6938 return NULL;
6939 /* Fast path: if it is a one-byte string, construct
6940 bytes object directly. */
6941 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6942 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6943 PyUnicode_GET_LENGTH(unicode));
6944 /* Non-Latin-1 characters present. Defer to above function to
6945 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006946 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006947}
6948
6949PyObject*
6950PyUnicode_AsLatin1String(PyObject *unicode)
6951{
6952 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953}
6954
6955/* --- 7-bit ASCII Codec -------------------------------------------------- */
6956
Alexander Belopolsky40018472011-02-26 01:02:56 +00006957PyObject *
6958PyUnicode_DecodeASCII(const char *s,
6959 Py_ssize_t size,
6960 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006963 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006964 int kind;
6965 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006966 Py_ssize_t startinpos;
6967 Py_ssize_t endinpos;
6968 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006971 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006972 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006975 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006976
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006978 if (size == 1 && (unsigned char)s[0] < 128)
6979 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006980
Victor Stinner8f674cc2013-04-17 23:02:17 +02006981 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006982 writer.min_length = size;
6983 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006984 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006986 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006987 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006988 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 writer.pos = outpos;
6990 if (writer.pos == size)
6991 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006992
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006993 s += writer.pos;
6994 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006995 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006996 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006998 PyUnicode_WRITE(kind, data, writer.pos, c);
6999 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007001 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007003
7004 /* byte outsize range 0x00..0x7f: call the error handler */
7005
7006 if (error_handler == _Py_ERROR_UNKNOWN)
7007 error_handler = get_error_handler(errors);
7008
7009 switch (error_handler)
7010 {
7011 case _Py_ERROR_REPLACE:
7012 case _Py_ERROR_SURROGATEESCAPE:
7013 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007014 but we may switch to UCS2 at the first write */
7015 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7016 goto onError;
7017 kind = writer.kind;
7018 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007019
7020 if (error_handler == _Py_ERROR_REPLACE)
7021 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7022 else
7023 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7024 writer.pos++;
7025 ++s;
7026 break;
7027
7028 case _Py_ERROR_IGNORE:
7029 ++s;
7030 break;
7031
7032 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 startinpos = s-starts;
7034 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007035 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007036 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 "ascii", "ordinal not in range(128)",
7038 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007039 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007041 kind = writer.kind;
7042 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007045 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007046 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007047 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007048
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007050 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007051 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 return NULL;
7054}
7055
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007056/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007057PyObject *
7058PyUnicode_EncodeASCII(const Py_UNICODE *p,
7059 Py_ssize_t size,
7060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062 PyObject *result;
7063 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7064 if (unicode == NULL)
7065 return NULL;
7066 result = unicode_encode_ucs1(unicode, errors, 128);
7067 Py_DECREF(unicode);
7068 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069}
7070
Alexander Belopolsky40018472011-02-26 01:02:56 +00007071PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007072_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073{
7074 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 PyErr_BadArgument();
7076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007078 if (PyUnicode_READY(unicode) == -1)
7079 return NULL;
7080 /* Fast path: if it is an ASCII-only string, construct bytes object
7081 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007082 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007083 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7084 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007085 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007086}
7087
7088PyObject *
7089PyUnicode_AsASCIIString(PyObject *unicode)
7090{
7091 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092}
7093
Steve Dowercc16be82016-09-08 10:35:16 -07007094#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007095
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007096/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007097
/* When size_t is wider than int, a buffer can be longer than INT_MAX and
   must be processed in several int-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7105
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007106static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007107code_page_name(UINT code_page, PyObject **obj)
7108{
7109 *obj = NULL;
7110 if (code_page == CP_ACP)
7111 return "mbcs";
7112 if (code_page == CP_UTF7)
7113 return "CP_UTF7";
7114 if (code_page == CP_UTF8)
7115 return "CP_UTF8";
7116
7117 *obj = PyBytes_FromFormat("cp%u", code_page);
7118 if (*obj == NULL)
7119 return NULL;
7120 return PyBytes_AS_STRING(*obj);
7121}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
Victor Stinner3a50e702011-10-18 21:21:00 +02007123static DWORD
7124decode_code_page_flags(UINT code_page)
7125{
7126 if (code_page == CP_UTF7) {
7127 /* The CP_UTF7 decoder only supports flags=0 */
7128 return 0;
7129 }
7130 else
7131 return MB_ERR_INVALID_CHARS;
7132}
7133
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 * Decode a byte string from a Windows code page into unicode object in strict
7136 * mode.
7137 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007138 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7139 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007141static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007142decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007143 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 const char *in,
7145 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007146{
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007148 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150
7151 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 assert(insize > 0);
7153 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7154 if (outsize <= 0)
7155 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156
7157 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007159 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007160 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 if (*v == NULL)
7162 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164 }
7165 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007168 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171 }
7172
7173 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7175 if (outsize <= 0)
7176 goto error;
7177 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007178
Victor Stinner3a50e702011-10-18 21:21:00 +02007179error:
7180 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7181 return -2;
7182 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007183 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184}
7185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186/*
7187 * Decode a byte string from a code page into unicode object with an error
7188 * handler.
7189 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007190 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 * UnicodeDecodeError exception and returns -1 on error.
7192 */
7193static int
7194decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007195 PyObject **v,
7196 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007197 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007198{
7199 const char *startin = in;
7200 const char *endin = in + size;
7201 const DWORD flags = decode_code_page_flags(code_page);
7202 /* Ideally, we should get reason from FormatMessage. This is the Windows
7203 2000 English version of the message. */
7204 const char *reason = "No mapping for the Unicode character exists "
7205 "in the target code page.";
7206 /* each step cannot decode more than 1 character, but a character can be
7207 represented as a surrogate pair */
7208 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007209 int insize;
7210 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 PyObject *errorHandler = NULL;
7212 PyObject *exc = NULL;
7213 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007214 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 DWORD err;
7216 int ret = -1;
7217
7218 assert(size > 0);
7219
7220 encoding = code_page_name(code_page, &encoding_obj);
7221 if (encoding == NULL)
7222 return -1;
7223
Victor Stinner7d00cc12014-03-17 23:08:06 +01007224 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7226 UnicodeDecodeError. */
7227 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7228 if (exc != NULL) {
7229 PyCodec_StrictErrors(exc);
7230 Py_CLEAR(exc);
7231 }
7232 goto error;
7233 }
7234
7235 if (*v == NULL) {
7236 /* Create unicode object */
7237 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7238 PyErr_NoMemory();
7239 goto error;
7240 }
Victor Stinnerab595942011-12-17 04:59:06 +01007241 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007242 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 if (*v == NULL)
7244 goto error;
7245 startout = PyUnicode_AS_UNICODE(*v);
7246 }
7247 else {
7248 /* Extend unicode object */
7249 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7250 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7251 PyErr_NoMemory();
7252 goto error;
7253 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007254 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 goto error;
7256 startout = PyUnicode_AS_UNICODE(*v) + n;
7257 }
7258
7259 /* Decode the byte string character per character */
7260 out = startout;
7261 while (in < endin)
7262 {
7263 /* Decode a character */
7264 insize = 1;
7265 do
7266 {
7267 outsize = MultiByteToWideChar(code_page, flags,
7268 in, insize,
7269 buffer, Py_ARRAY_LENGTH(buffer));
7270 if (outsize > 0)
7271 break;
7272 err = GetLastError();
7273 if (err != ERROR_NO_UNICODE_TRANSLATION
7274 && err != ERROR_INSUFFICIENT_BUFFER)
7275 {
7276 PyErr_SetFromWindowsErr(0);
7277 goto error;
7278 }
7279 insize++;
7280 }
7281 /* 4=maximum length of a UTF-8 sequence */
7282 while (insize <= 4 && (in + insize) <= endin);
7283
7284 if (outsize <= 0) {
7285 Py_ssize_t startinpos, endinpos, outpos;
7286
Victor Stinner7d00cc12014-03-17 23:08:06 +01007287 /* last character in partial decode? */
7288 if (in + insize >= endin && !final)
7289 break;
7290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 startinpos = in - startin;
7292 endinpos = startinpos + 1;
7293 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007294 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 errors, &errorHandler,
7296 encoding, reason,
7297 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007298 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 {
7300 goto error;
7301 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007302 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 }
7304 else {
7305 in += insize;
7306 memcpy(out, buffer, outsize * sizeof(wchar_t));
7307 out += outsize;
7308 }
7309 }
7310
7311 /* write a NUL character at the end */
7312 *out = 0;
7313
7314 /* Extend unicode object */
7315 outsize = out - startout;
7316 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007317 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007319 /* (in - startin) <= size and size is an int */
7320 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007321
7322error:
7323 Py_XDECREF(encoding_obj);
7324 Py_XDECREF(errorHandler);
7325 Py_XDECREF(exc);
7326 return ret;
7327}
7328
Victor Stinner3a50e702011-10-18 21:21:00 +02007329static PyObject *
7330decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 const char *s, Py_ssize_t size,
7332 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333{
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 PyObject *v = NULL;
7335 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 if (code_page < 0) {
7338 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7339 return NULL;
7340 }
7341
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344
Victor Stinner76a31a62011-11-04 00:05:13 +01007345 do
7346 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007348 if (size > INT_MAX) {
7349 chunk_size = INT_MAX;
7350 final = 0;
7351 done = 0;
7352 }
7353 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007355 {
7356 chunk_size = (int)size;
7357 final = (consumed == NULL);
7358 done = 1;
7359 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360
Victor Stinner76a31a62011-11-04 00:05:13 +01007361 if (chunk_size == 0 && done) {
7362 if (v != NULL)
7363 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007364 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007365 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366
Victor Stinner76a31a62011-11-04 00:05:13 +01007367 converted = decode_code_page_strict(code_page, &v,
7368 s, chunk_size);
7369 if (converted == -2)
7370 converted = decode_code_page_errors(code_page, &v,
7371 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007372 errors, final);
7373 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007374
7375 if (converted < 0) {
7376 Py_XDECREF(v);
7377 return NULL;
7378 }
7379
7380 if (consumed)
7381 *consumed += converted;
7382
7383 s += converted;
7384 size -= converted;
7385 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007386
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007387 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388}
7389
Alexander Belopolsky40018472011-02-26 01:02:56 +00007390PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007391PyUnicode_DecodeCodePageStateful(int code_page,
7392 const char *s,
7393 Py_ssize_t size,
7394 const char *errors,
7395 Py_ssize_t *consumed)
7396{
7397 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7398}
7399
7400PyObject *
7401PyUnicode_DecodeMBCSStateful(const char *s,
7402 Py_ssize_t size,
7403 const char *errors,
7404 Py_ssize_t *consumed)
7405{
7406 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7407}
7408
7409PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007410PyUnicode_DecodeMBCS(const char *s,
7411 Py_ssize_t size,
7412 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007413{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007414 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7415}
7416
Victor Stinner3a50e702011-10-18 21:21:00 +02007417static DWORD
7418encode_code_page_flags(UINT code_page, const char *errors)
7419{
7420 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007421 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 }
7423 else if (code_page == CP_UTF7) {
7424 /* CP_UTF7 only supports flags=0 */
7425 return 0;
7426 }
7427 else {
7428 if (errors != NULL && strcmp(errors, "replace") == 0)
7429 return 0;
7430 else
7431 return WC_NO_BEST_FIT_CHARS;
7432 }
7433}
7434
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 * Encode a Unicode string to a Windows code page into a byte string in strict
7437 * mode.
7438 *
7439 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007440 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007442static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007443encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007444 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446{
Victor Stinner554f3f02010-06-16 23:33:54 +00007447 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 BOOL *pusedDefaultChar = &usedDefaultChar;
7449 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007450 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007451 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 const DWORD flags = encode_code_page_flags(code_page, NULL);
7453 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 /* Create a substring so that we can get the UTF-16 representation
7455 of just the slice under consideration. */
7456 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007457
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007461 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007463 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007464
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 substring = PyUnicode_Substring(unicode, offset, offset+len);
7466 if (substring == NULL)
7467 return -1;
7468 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7469 if (p == NULL) {
7470 Py_DECREF(substring);
7471 return -1;
7472 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007473 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007474
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007475 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007477 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 NULL, 0,
7479 NULL, pusedDefaultChar);
7480 if (outsize <= 0)
7481 goto error;
7482 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 if (pusedDefaultChar && *pusedDefaultChar) {
7484 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007487
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 if (*outbytes == NULL) {
7492 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496 }
7497 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 const Py_ssize_t n = PyBytes_Size(*outbytes);
7500 if (outsize > PY_SSIZE_T_MAX - n) {
7501 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7506 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007508 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510 }
7511
7512 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007514 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 out, outsize,
7516 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 if (outsize <= 0)
7519 goto error;
7520 if (pusedDefaultChar && *pusedDefaultChar)
7521 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007523
Victor Stinner3a50e702011-10-18 21:21:00 +02007524error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7527 return -2;
7528 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007529 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007530}
7531
Victor Stinner3a50e702011-10-18 21:21:00 +02007532/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007533 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 * error handler.
7535 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007536 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 * -1 on other error.
7538 */
7539static int
7540encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007541 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007542 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007543{
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007545 Py_ssize_t pos = unicode_offset;
7546 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 /* Ideally, we should get reason from FormatMessage. This is the Windows
7548 2000 English version of the message. */
7549 const char *reason = "invalid character";
7550 /* 4=maximum length of a UTF-8 sequence */
7551 char buffer[4];
7552 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7553 Py_ssize_t outsize;
7554 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 PyObject *errorHandler = NULL;
7556 PyObject *exc = NULL;
7557 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007558 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 PyObject *rep;
7561 int ret = -1;
7562
7563 assert(insize > 0);
7564
7565 encoding = code_page_name(code_page, &encoding_obj);
7566 if (encoding == NULL)
7567 return -1;
7568
7569 if (errors == NULL || strcmp(errors, "strict") == 0) {
7570 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7571 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007572 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 if (exc != NULL) {
7574 PyCodec_StrictErrors(exc);
7575 Py_DECREF(exc);
7576 }
7577 Py_XDECREF(encoding_obj);
7578 return -1;
7579 }
7580
7581 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7582 pusedDefaultChar = &usedDefaultChar;
7583 else
7584 pusedDefaultChar = NULL;
7585
7586 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7587 PyErr_NoMemory();
7588 goto error;
7589 }
7590 outsize = insize * Py_ARRAY_LENGTH(buffer);
7591
7592 if (*outbytes == NULL) {
7593 /* Create string object */
7594 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7595 if (*outbytes == NULL)
7596 goto error;
7597 out = PyBytes_AS_STRING(*outbytes);
7598 }
7599 else {
7600 /* Extend string object */
7601 Py_ssize_t n = PyBytes_Size(*outbytes);
7602 if (n > PY_SSIZE_T_MAX - outsize) {
7603 PyErr_NoMemory();
7604 goto error;
7605 }
7606 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7607 goto error;
7608 out = PyBytes_AS_STRING(*outbytes) + n;
7609 }
7610
7611 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007612 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007614 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7615 wchar_t chars[2];
7616 int charsize;
7617 if (ch < 0x10000) {
7618 chars[0] = (wchar_t)ch;
7619 charsize = 1;
7620 }
7621 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007622 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7623 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007624 charsize = 2;
7625 }
7626
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 buffer, Py_ARRAY_LENGTH(buffer),
7630 NULL, pusedDefaultChar);
7631 if (outsize > 0) {
7632 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7633 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007634 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007635 memcpy(out, buffer, outsize);
7636 out += outsize;
7637 continue;
7638 }
7639 }
7640 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7641 PyErr_SetFromWindowsErr(0);
7642 goto error;
7643 }
7644
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 rep = unicode_encode_call_errorhandler(
7646 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007647 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 if (rep == NULL)
7650 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007652
7653 if (PyBytes_Check(rep)) {
7654 outsize = PyBytes_GET_SIZE(rep);
7655 if (outsize != 1) {
7656 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7657 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7658 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7659 Py_DECREF(rep);
7660 goto error;
7661 }
7662 out = PyBytes_AS_STRING(*outbytes) + offset;
7663 }
7664 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7665 out += outsize;
7666 }
7667 else {
7668 Py_ssize_t i;
7669 enum PyUnicode_Kind kind;
7670 void *data;
7671
Benjamin Petersonbac79492012-01-14 13:34:47 -05007672 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 Py_DECREF(rep);
7674 goto error;
7675 }
7676
7677 outsize = PyUnicode_GET_LENGTH(rep);
7678 if (outsize != 1) {
7679 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7680 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7681 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7682 Py_DECREF(rep);
7683 goto error;
7684 }
7685 out = PyBytes_AS_STRING(*outbytes) + offset;
7686 }
7687 kind = PyUnicode_KIND(rep);
7688 data = PyUnicode_DATA(rep);
7689 for (i=0; i < outsize; i++) {
7690 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7691 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007692 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007693 encoding, unicode,
7694 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 "unable to encode error handler result to ASCII");
7696 Py_DECREF(rep);
7697 goto error;
7698 }
7699 *out = (unsigned char)ch;
7700 out++;
7701 }
7702 }
7703 Py_DECREF(rep);
7704 }
7705 /* write a NUL byte */
7706 *out = 0;
7707 outsize = out - PyBytes_AS_STRING(*outbytes);
7708 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7709 if (_PyBytes_Resize(outbytes, outsize) < 0)
7710 goto error;
7711 ret = 0;
7712
7713error:
7714 Py_XDECREF(encoding_obj);
7715 Py_XDECREF(errorHandler);
7716 Py_XDECREF(exc);
7717 return ret;
7718}
7719
Victor Stinner3a50e702011-10-18 21:21:00 +02007720static PyObject *
7721encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007722 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007723 const char *errors)
7724{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007726 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007727 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007728 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007729
Victor Stinner29dacf22015-01-26 16:41:32 +01007730 if (!PyUnicode_Check(unicode)) {
7731 PyErr_BadArgument();
7732 return NULL;
7733 }
7734
Benjamin Petersonbac79492012-01-14 13:34:47 -05007735 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007736 return NULL;
7737 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007738
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 if (code_page < 0) {
7740 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7741 return NULL;
7742 }
7743
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 return PyBytes_FromStringAndSize(NULL, 0);
7746
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 offset = 0;
7748 do
7749 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007751 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007752 chunks. */
7753 if (len > INT_MAX/2) {
7754 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007755 done = 0;
7756 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007758#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007760 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007761 done = 1;
7762 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007763
Victor Stinner76a31a62011-11-04 00:05:13 +01007764 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007765 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007766 errors);
7767 if (ret == -2)
7768 ret = encode_code_page_errors(code_page, &outbytes,
7769 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007770 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 if (ret < 0) {
7772 Py_XDECREF(outbytes);
7773 return NULL;
7774 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007775
Victor Stinner7581cef2011-11-03 22:32:33 +01007776 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007778 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007779
Victor Stinner3a50e702011-10-18 21:21:00 +02007780 return outbytes;
7781}
7782
7783PyObject *
7784PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7785 Py_ssize_t size,
7786 const char *errors)
7787{
Victor Stinner7581cef2011-11-03 22:32:33 +01007788 PyObject *unicode, *res;
7789 unicode = PyUnicode_FromUnicode(p, size);
7790 if (unicode == NULL)
7791 return NULL;
7792 res = encode_code_page(CP_ACP, unicode, errors);
7793 Py_DECREF(unicode);
7794 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007795}
7796
7797PyObject *
7798PyUnicode_EncodeCodePage(int code_page,
7799 PyObject *unicode,
7800 const char *errors)
7801{
Victor Stinner7581cef2011-11-03 22:32:33 +01007802 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007803}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007804
Alexander Belopolsky40018472011-02-26 01:02:56 +00007805PyObject *
7806PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007807{
Victor Stinner7581cef2011-11-03 22:32:33 +01007808 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007809}
7810
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007811#undef NEED_RETRY
7812
Steve Dowercc16be82016-09-08 10:35:16 -07007813#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007814
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815/* --- Character Mapping Codec -------------------------------------------- */
7816
Victor Stinnerfb161b12013-04-18 01:44:27 +02007817static int
7818charmap_decode_string(const char *s,
7819 Py_ssize_t size,
7820 PyObject *mapping,
7821 const char *errors,
7822 _PyUnicodeWriter *writer)
7823{
7824 const char *starts = s;
7825 const char *e;
7826 Py_ssize_t startinpos, endinpos;
7827 PyObject *errorHandler = NULL, *exc = NULL;
7828 Py_ssize_t maplen;
7829 enum PyUnicode_Kind mapkind;
7830 void *mapdata;
7831 Py_UCS4 x;
7832 unsigned char ch;
7833
7834 if (PyUnicode_READY(mapping) == -1)
7835 return -1;
7836
7837 maplen = PyUnicode_GET_LENGTH(mapping);
7838 mapdata = PyUnicode_DATA(mapping);
7839 mapkind = PyUnicode_KIND(mapping);
7840
7841 e = s + size;
7842
7843 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7844 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7845 * is disabled in encoding aliases, latin1 is preferred because
7846 * its implementation is faster. */
7847 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7848 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7849 Py_UCS4 maxchar = writer->maxchar;
7850
7851 assert (writer->kind == PyUnicode_1BYTE_KIND);
7852 while (s < e) {
7853 ch = *s;
7854 x = mapdata_ucs1[ch];
7855 if (x > maxchar) {
7856 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7857 goto onError;
7858 maxchar = writer->maxchar;
7859 outdata = (Py_UCS1 *)writer->data;
7860 }
7861 outdata[writer->pos] = x;
7862 writer->pos++;
7863 ++s;
7864 }
7865 return 0;
7866 }
7867
7868 while (s < e) {
7869 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7870 enum PyUnicode_Kind outkind = writer->kind;
7871 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7872 if (outkind == PyUnicode_1BYTE_KIND) {
7873 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7874 Py_UCS4 maxchar = writer->maxchar;
7875 while (s < e) {
7876 ch = *s;
7877 x = mapdata_ucs2[ch];
7878 if (x > maxchar)
7879 goto Error;
7880 outdata[writer->pos] = x;
7881 writer->pos++;
7882 ++s;
7883 }
7884 break;
7885 }
7886 else if (outkind == PyUnicode_2BYTE_KIND) {
7887 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7888 while (s < e) {
7889 ch = *s;
7890 x = mapdata_ucs2[ch];
7891 if (x == 0xFFFE)
7892 goto Error;
7893 outdata[writer->pos] = x;
7894 writer->pos++;
7895 ++s;
7896 }
7897 break;
7898 }
7899 }
7900 ch = *s;
7901
7902 if (ch < maplen)
7903 x = PyUnicode_READ(mapkind, mapdata, ch);
7904 else
7905 x = 0xfffe; /* invalid value */
7906Error:
7907 if (x == 0xfffe)
7908 {
7909 /* undefined mapping */
7910 startinpos = s-starts;
7911 endinpos = startinpos+1;
7912 if (unicode_decode_call_errorhandler_writer(
7913 errors, &errorHandler,
7914 "charmap", "character maps to <undefined>",
7915 &starts, &e, &startinpos, &endinpos, &exc, &s,
7916 writer)) {
7917 goto onError;
7918 }
7919 continue;
7920 }
7921
7922 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7923 goto onError;
7924 ++s;
7925 }
7926 Py_XDECREF(errorHandler);
7927 Py_XDECREF(exc);
7928 return 0;
7929
7930onError:
7931 Py_XDECREF(errorHandler);
7932 Py_XDECREF(exc);
7933 return -1;
7934}
7935
7936static int
7937charmap_decode_mapping(const char *s,
7938 Py_ssize_t size,
7939 PyObject *mapping,
7940 const char *errors,
7941 _PyUnicodeWriter *writer)
7942{
7943 const char *starts = s;
7944 const char *e;
7945 Py_ssize_t startinpos, endinpos;
7946 PyObject *errorHandler = NULL, *exc = NULL;
7947 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007948 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007949
7950 e = s + size;
7951
7952 while (s < e) {
7953 ch = *s;
7954
7955 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7956 key = PyLong_FromLong((long)ch);
7957 if (key == NULL)
7958 goto onError;
7959
7960 item = PyObject_GetItem(mapping, key);
7961 Py_DECREF(key);
7962 if (item == NULL) {
7963 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7964 /* No mapping found means: mapping is undefined. */
7965 PyErr_Clear();
7966 goto Undefined;
7967 } else
7968 goto onError;
7969 }
7970
7971 /* Apply mapping */
7972 if (item == Py_None)
7973 goto Undefined;
7974 if (PyLong_Check(item)) {
7975 long value = PyLong_AS_LONG(item);
7976 if (value == 0xFFFE)
7977 goto Undefined;
7978 if (value < 0 || value > MAX_UNICODE) {
7979 PyErr_Format(PyExc_TypeError,
7980 "character mapping must be in range(0x%lx)",
7981 (unsigned long)MAX_UNICODE + 1);
7982 goto onError;
7983 }
7984
7985 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7986 goto onError;
7987 }
7988 else if (PyUnicode_Check(item)) {
7989 if (PyUnicode_READY(item) == -1)
7990 goto onError;
7991 if (PyUnicode_GET_LENGTH(item) == 1) {
7992 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7993 if (value == 0xFFFE)
7994 goto Undefined;
7995 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7996 goto onError;
7997 }
7998 else {
7999 writer->overallocate = 1;
8000 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8001 goto onError;
8002 }
8003 }
8004 else {
8005 /* wrong return value */
8006 PyErr_SetString(PyExc_TypeError,
8007 "character mapping must return integer, None or str");
8008 goto onError;
8009 }
8010 Py_CLEAR(item);
8011 ++s;
8012 continue;
8013
8014Undefined:
8015 /* undefined mapping */
8016 Py_CLEAR(item);
8017 startinpos = s-starts;
8018 endinpos = startinpos+1;
8019 if (unicode_decode_call_errorhandler_writer(
8020 errors, &errorHandler,
8021 "charmap", "character maps to <undefined>",
8022 &starts, &e, &startinpos, &endinpos, &exc, &s,
8023 writer)) {
8024 goto onError;
8025 }
8026 }
8027 Py_XDECREF(errorHandler);
8028 Py_XDECREF(exc);
8029 return 0;
8030
8031onError:
8032 Py_XDECREF(item);
8033 Py_XDECREF(errorHandler);
8034 Py_XDECREF(exc);
8035 return -1;
8036}
8037
Alexander Belopolsky40018472011-02-26 01:02:56 +00008038PyObject *
8039PyUnicode_DecodeCharmap(const char *s,
8040 Py_ssize_t size,
8041 PyObject *mapping,
8042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008044 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008045
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 /* Default to Latin-1 */
8047 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008051 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008052 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008053 writer.min_length = size;
8054 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008056
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008057 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008058 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8059 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008060 }
8061 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008062 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8063 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008065 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008066
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008068 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 return NULL;
8070}
8071
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072/* Charmap encoding: the lookup table */
8073
Alexander Belopolsky40018472011-02-26 01:02:56 +00008074struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 PyObject_HEAD
8076 unsigned char level1[32];
8077 int count2, count3;
8078 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079};
8080
8081static PyObject*
8082encoding_map_size(PyObject *obj, PyObject* args)
8083{
8084 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087}
8088
8089static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008090 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 PyDoc_STR("Return the size (in bytes) of this object") },
8092 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093};
8094
8095static void
8096encoding_map_dealloc(PyObject* o)
8097{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099}
8100
8101static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008102 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 "EncodingMap", /*tp_name*/
8104 sizeof(struct encoding_map), /*tp_basicsize*/
8105 0, /*tp_itemsize*/
8106 /* methods */
8107 encoding_map_dealloc, /*tp_dealloc*/
8108 0, /*tp_print*/
8109 0, /*tp_getattr*/
8110 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008111 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 0, /*tp_repr*/
8113 0, /*tp_as_number*/
8114 0, /*tp_as_sequence*/
8115 0, /*tp_as_mapping*/
8116 0, /*tp_hash*/
8117 0, /*tp_call*/
8118 0, /*tp_str*/
8119 0, /*tp_getattro*/
8120 0, /*tp_setattro*/
8121 0, /*tp_as_buffer*/
8122 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8123 0, /*tp_doc*/
8124 0, /*tp_traverse*/
8125 0, /*tp_clear*/
8126 0, /*tp_richcompare*/
8127 0, /*tp_weaklistoffset*/
8128 0, /*tp_iter*/
8129 0, /*tp_iternext*/
8130 encoding_map_methods, /*tp_methods*/
8131 0, /*tp_members*/
8132 0, /*tp_getset*/
8133 0, /*tp_base*/
8134 0, /*tp_dict*/
8135 0, /*tp_descr_get*/
8136 0, /*tp_descr_set*/
8137 0, /*tp_dictoffset*/
8138 0, /*tp_init*/
8139 0, /*tp_alloc*/
8140 0, /*tp_new*/
8141 0, /*tp_free*/
8142 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143};
8144
8145PyObject*
8146PyUnicode_BuildEncodingMap(PyObject* string)
8147{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 PyObject *result;
8149 struct encoding_map *mresult;
8150 int i;
8151 int need_dict = 0;
8152 unsigned char level1[32];
8153 unsigned char level2[512];
8154 unsigned char *mlevel1, *mlevel2, *mlevel3;
8155 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 int kind;
8157 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008158 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008161 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 PyErr_BadArgument();
8163 return NULL;
8164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 kind = PyUnicode_KIND(string);
8166 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008167 length = PyUnicode_GET_LENGTH(string);
8168 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 memset(level1, 0xFF, sizeof level1);
8170 memset(level2, 0xFF, sizeof level2);
8171
8172 /* If there isn't a one-to-one mapping of NULL to \0,
8173 or if there are non-BMP characters, we need to use
8174 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008177 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008179 ch = PyUnicode_READ(kind, data, i);
8180 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181 need_dict = 1;
8182 break;
8183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185 /* unmapped character */
8186 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 l1 = ch >> 11;
8188 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 if (level1[l1] == 0xFF)
8190 level1[l1] = count2++;
8191 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 }
8194
8195 if (count2 >= 0xFF || count3 >= 0xFF)
8196 need_dict = 1;
8197
8198 if (need_dict) {
8199 PyObject *result = PyDict_New();
8200 PyObject *key, *value;
8201 if (!result)
8202 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008203 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008205 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008206 if (!key || !value)
8207 goto failed1;
8208 if (PyDict_SetItem(result, key, value) == -1)
8209 goto failed1;
8210 Py_DECREF(key);
8211 Py_DECREF(value);
8212 }
8213 return result;
8214 failed1:
8215 Py_XDECREF(key);
8216 Py_XDECREF(value);
8217 Py_DECREF(result);
8218 return NULL;
8219 }
8220
8221 /* Create a three-level trie */
8222 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8223 16*count2 + 128*count3 - 1);
8224 if (!result)
8225 return PyErr_NoMemory();
8226 PyObject_Init(result, &EncodingMapType);
8227 mresult = (struct encoding_map*)result;
8228 mresult->count2 = count2;
8229 mresult->count3 = count3;
8230 mlevel1 = mresult->level1;
8231 mlevel2 = mresult->level23;
8232 mlevel3 = mresult->level23 + 16*count2;
8233 memcpy(mlevel1, level1, 32);
8234 memset(mlevel2, 0xFF, 16*count2);
8235 memset(mlevel3, 0, 128*count3);
8236 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008237 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008239 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8240 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008241 /* unmapped character */
8242 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008243 o1 = ch>>11;
8244 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245 i2 = 16*mlevel1[o1] + o2;
8246 if (mlevel2[i2] == 0xFF)
8247 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008248 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249 i3 = 128*mlevel2[i2] + o3;
8250 mlevel3[i3] = i;
8251 }
8252 return result;
8253}
8254
8255static int
Victor Stinner22168992011-11-20 17:09:18 +01008256encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257{
8258 struct encoding_map *map = (struct encoding_map*)mapping;
8259 int l1 = c>>11;
8260 int l2 = (c>>7) & 0xF;
8261 int l3 = c & 0x7F;
8262 int i;
8263
Victor Stinner22168992011-11-20 17:09:18 +01008264 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 if (c == 0)
8267 return 0;
8268 /* level 1*/
8269 i = map->level1[l1];
8270 if (i == 0xFF) {
8271 return -1;
8272 }
8273 /* level 2*/
8274 i = map->level23[16*i+l2];
8275 if (i == 0xFF) {
8276 return -1;
8277 }
8278 /* level 3 */
8279 i = map->level23[16*map->count2 + 128*i + l3];
8280 if (i == 0) {
8281 return -1;
8282 }
8283 return i;
8284}
8285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286/* Lookup the character ch in the mapping. If the character
8287 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008288 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008289static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008290charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291{
Christian Heimes217cfd12007-12-02 14:31:20 +00008292 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293 PyObject *x;
8294
8295 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 x = PyObject_GetItem(mapping, w);
8298 Py_DECREF(w);
8299 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8301 /* No mapping found means: mapping is undefined. */
8302 PyErr_Clear();
8303 x = Py_None;
8304 Py_INCREF(x);
8305 return x;
8306 } else
8307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008309 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008311 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 long value = PyLong_AS_LONG(x);
8313 if (value < 0 || value > 255) {
8314 PyErr_SetString(PyExc_TypeError,
8315 "character mapping must be in range(256)");
8316 Py_DECREF(x);
8317 return NULL;
8318 }
8319 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008321 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 /* wrong return value */
8325 PyErr_Format(PyExc_TypeError,
8326 "character mapping must return integer, bytes or None, not %.400s",
8327 x->ob_type->tp_name);
8328 Py_DECREF(x);
8329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 }
8331}
8332
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008334charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008336 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8337 /* exponentially overallocate to minimize reallocations */
8338 if (requiredsize < 2*outsize)
8339 requiredsize = 2*outsize;
8340 if (_PyBytes_Resize(outobj, requiredsize))
8341 return -1;
8342 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343}
8344
/* Outcome of writing one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008349 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 space is available. Return a new reference to the object that
8351 was put in the output buffer, or Py_None, if the mapping was undefined
8352 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008353 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008355charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 PyObject *rep;
8359 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008360 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361
Christian Heimes90aa7642007-12-19 02:45:37 +00008362 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 if (res == -1)
8366 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 if (outsize<requiredsize)
8368 if (charmapencode_resize(outobj, outpos, requiredsize))
8369 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008370 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 outstart[(*outpos)++] = (char)res;
8372 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008373 }
8374
8375 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 Py_DECREF(rep);
8380 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008381 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 if (PyLong_Check(rep)) {
8383 Py_ssize_t requiredsize = *outpos+1;
8384 if (outsize<requiredsize)
8385 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8386 Py_DECREF(rep);
8387 return enc_EXCEPTION;
8388 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008389 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 else {
8393 const char *repchars = PyBytes_AS_STRING(rep);
8394 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8395 Py_ssize_t requiredsize = *outpos+repsize;
8396 if (outsize<requiredsize)
8397 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8398 Py_DECREF(rep);
8399 return enc_EXCEPTION;
8400 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008401 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 memcpy(outstart + *outpos, repchars, repsize);
8403 *outpos += repsize;
8404 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406 Py_DECREF(rep);
8407 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408}
8409
8410/* handle an error in PyUnicode_EncodeCharmap
8411 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008412static int
8413charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008414 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008416 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008417 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418{
8419 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008422 enum PyUnicode_Kind kind;
8423 void *data;
8424 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008426 Py_ssize_t collstartpos = *inpos;
8427 Py_ssize_t collendpos = *inpos+1;
8428 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 char *encoding = "charmap";
8430 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008431 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008432 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008433 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434
Benjamin Petersonbac79492012-01-14 13:34:47 -05008435 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008436 return -1;
8437 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 /* find all unencodable characters */
8439 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008440 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008441 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008442 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008443 val = encoding_map_lookup(ch, mapping);
8444 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 break;
8446 ++collendpos;
8447 continue;
8448 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008449
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008450 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8451 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 if (rep==NULL)
8453 return -1;
8454 else if (rep!=Py_None) {
8455 Py_DECREF(rep);
8456 break;
8457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 }
8461 /* cache callback name lookup
8462 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008463 if (*error_handler == _Py_ERROR_UNKNOWN)
8464 *error_handler = get_error_handler(errors);
8465
8466 switch (*error_handler) {
8467 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008468 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008470
8471 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 x = charmapencode_output('?', mapping, res, respos);
8474 if (x==enc_EXCEPTION) {
8475 return -1;
8476 }
8477 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008478 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 return -1;
8480 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008481 }
8482 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008483 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 *inpos = collendpos;
8485 break;
Victor Stinner50149202015-09-22 00:26:54 +02008486
8487 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008488 /* generate replacement (temporarily (mis)uses p) */
8489 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 char buffer[2+29+1+1];
8491 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008492 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 for (cp = buffer; *cp; ++cp) {
8494 x = charmapencode_output(*cp, mapping, res, respos);
8495 if (x==enc_EXCEPTION)
8496 return -1;
8497 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008498 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return -1;
8500 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501 }
8502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008503 *inpos = collendpos;
8504 break;
Victor Stinner50149202015-09-22 00:26:54 +02008505
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 default:
Victor Stinner50149202015-09-22 00:26:54 +02008507 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008512 if (PyBytes_Check(repunicode)) {
8513 /* Directly copy bytes result to output. */
8514 Py_ssize_t outsize = PyBytes_Size(*res);
8515 Py_ssize_t requiredsize;
8516 repsize = PyBytes_Size(repunicode);
8517 requiredsize = *respos + repsize;
8518 if (requiredsize > outsize)
8519 /* Make room for all additional bytes. */
8520 if (charmapencode_resize(res, respos, requiredsize)) {
8521 Py_DECREF(repunicode);
8522 return -1;
8523 }
8524 memcpy(PyBytes_AsString(*res) + *respos,
8525 PyBytes_AsString(repunicode), repsize);
8526 *respos += repsize;
8527 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008528 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008529 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008530 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008531 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008532 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008533 Py_DECREF(repunicode);
8534 return -1;
8535 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008536 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008537 data = PyUnicode_DATA(repunicode);
8538 kind = PyUnicode_KIND(repunicode);
8539 for (index = 0; index < repsize; index++) {
8540 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8541 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008543 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 return -1;
8545 }
8546 else if (x==enc_FAILED) {
8547 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008548 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
8550 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008551 }
8552 *inpos = newpos;
8553 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 }
8555 return 0;
8556}
8557
Alexander Belopolsky40018472011-02-26 01:02:56 +00008558PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008559_PyUnicode_EncodeCharmap(PyObject *unicode,
8560 PyObject *mapping,
8561 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 /* output object */
8564 PyObject *res = NULL;
8565 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008566 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008569 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008570 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008572 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008573 void *data;
8574 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Benjamin Petersonbac79492012-01-14 13:34:47 -05008576 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008577 return NULL;
8578 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008579 data = PyUnicode_DATA(unicode);
8580 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008581
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 /* Default to Latin-1 */
8583 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 /* allocate enough for a simple encoding without
8587 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008588 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 if (res == NULL)
8590 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008591 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008595 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008597 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 if (x==enc_EXCEPTION) /* error */
8599 goto onError;
8600 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008603 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 &res, &respos)) {
8605 goto onError;
8606 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 else
8609 /* done with this character => adjust input position */
8610 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008614 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008615 if (_PyBytes_Resize(&res, respos) < 0)
8616 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008619 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 return res;
8621
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 Py_XDECREF(res);
8624 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008625 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 return NULL;
8627}
8628
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008629/* Deprecated */
8630PyObject *
8631PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8632 Py_ssize_t size,
8633 PyObject *mapping,
8634 const char *errors)
8635{
8636 PyObject *result;
8637 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8638 if (unicode == NULL)
8639 return NULL;
8640 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8641 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008642 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008643}
8644
Alexander Belopolsky40018472011-02-26 01:02:56 +00008645PyObject *
8646PyUnicode_AsCharmapString(PyObject *unicode,
8647 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648{
8649 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 PyErr_BadArgument();
8651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008653 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654}
8655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008657static void
8658make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660 Py_ssize_t startpos, Py_ssize_t endpos,
8661 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 *exceptionObject = _PyUnicodeTranslateError_Create(
8665 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 }
8667 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8669 goto onError;
8670 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8671 goto onError;
8672 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8673 goto onError;
8674 return;
8675 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008676 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 }
8678}
8679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680/* error handling callback helper:
8681 build arguments, call the callback and check the arguments,
8682 put the result into newpos and return the replacement string, which
8683 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684static PyObject *
8685unicode_translate_call_errorhandler(const char *errors,
8686 PyObject **errorHandler,
8687 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008689 Py_ssize_t startpos, Py_ssize_t endpos,
8690 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008692 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008694 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 PyObject *restuple;
8696 PyObject *resunicode;
8697
8698 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 }
8703
8704 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708
8709 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008714 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 Py_DECREF(restuple);
8716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 }
8718 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 &resunicode, &i_newpos)) {
8720 Py_DECREF(restuple);
8721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008723 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008725 else
8726 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008728 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 Py_DECREF(restuple);
8730 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 Py_INCREF(resunicode);
8733 Py_DECREF(restuple);
8734 return resunicode;
8735}
8736
8737/* Lookup the character ch in the mapping and put the result in result,
8738 which must be decrefed by the caller.
8739 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008740static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742{
Christian Heimes217cfd12007-12-02 14:31:20 +00008743 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 PyObject *x;
8745
8746 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748 x = PyObject_GetItem(mapping, w);
8749 Py_DECREF(w);
8750 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8752 /* No mapping found means: use 1:1 mapping. */
8753 PyErr_Clear();
8754 *result = NULL;
8755 return 0;
8756 } else
8757 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 }
8759 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 *result = x;
8761 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008763 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008765 if (value < 0 || value > MAX_UNICODE) {
8766 PyErr_Format(PyExc_ValueError,
8767 "character mapping must be in range(0x%x)",
8768 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 Py_DECREF(x);
8770 return -1;
8771 }
8772 *result = x;
8773 return 0;
8774 }
8775 else if (PyUnicode_Check(x)) {
8776 *result = x;
8777 return 0;
8778 }
8779 else {
8780 /* wrong return value */
8781 PyErr_SetString(PyExc_TypeError,
8782 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008783 Py_DECREF(x);
8784 return -1;
8785 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786}
Victor Stinner1194ea02014-04-04 19:37:40 +02008787
8788/* lookup the character, write the result into the writer.
8789 Return 1 if the result was written into the writer, return 0 if the mapping
8790 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008791static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008792charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8793 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794{
Victor Stinner1194ea02014-04-04 19:37:40 +02008795 PyObject *item;
8796
8797 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008799
8800 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008802 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008805 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008807
8808 if (item == Py_None) {
8809 Py_DECREF(item);
8810 return 0;
8811 }
8812
8813 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008814 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8815 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8816 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008817 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8818 Py_DECREF(item);
8819 return -1;
8820 }
8821 Py_DECREF(item);
8822 return 1;
8823 }
8824
8825 if (!PyUnicode_Check(item)) {
8826 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008828 }
8829
8830 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8831 Py_DECREF(item);
8832 return -1;
8833 }
8834
8835 Py_DECREF(item);
8836 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008837}
8838
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839static int
8840unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8841 Py_UCS1 *translate)
8842{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008843 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844 int ret = 0;
8845
Victor Stinner89a76ab2014-04-05 11:44:04 +02008846 if (charmaptranslate_lookup(ch, mapping, &item)) {
8847 return -1;
8848 }
8849
8850 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008851 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008852 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008854 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008855 /* not found => default to 1:1 mapping */
8856 translate[ch] = ch;
8857 return 1;
8858 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008859 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008860 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008861 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8862 used it */
8863 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864 /* invalid character or character outside ASCII:
8865 skip the fast translate */
8866 goto exit;
8867 }
8868 translate[ch] = (Py_UCS1)replace;
8869 }
8870 else if (PyUnicode_Check(item)) {
8871 Py_UCS4 replace;
8872
8873 if (PyUnicode_READY(item) == -1) {
8874 Py_DECREF(item);
8875 return -1;
8876 }
8877 if (PyUnicode_GET_LENGTH(item) != 1)
8878 goto exit;
8879
8880 replace = PyUnicode_READ_CHAR(item, 0);
8881 if (replace > 127)
8882 goto exit;
8883 translate[ch] = (Py_UCS1)replace;
8884 }
8885 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008886 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887 goto exit;
8888 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008889 ret = 1;
8890
Benjamin Peterson1365de72014-04-07 20:15:41 -04008891 exit:
8892 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 return ret;
8894}
8895
8896/* Fast path for ascii => ascii translation. Return 1 if the whole string
8897 was translated into writer, return 0 if the input string was partially
8898 translated into writer, raise an exception and return -1 on error. */
8899static int
8900unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008901 _PyUnicodeWriter *writer, int ignore,
8902 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903{
Victor Stinner872b2912014-04-05 14:27:07 +02008904 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008905 Py_ssize_t len;
8906 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008907 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909 len = PyUnicode_GET_LENGTH(input);
8910
Victor Stinner872b2912014-04-05 14:27:07 +02008911 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912
8913 in = PyUnicode_1BYTE_DATA(input);
8914 end = in + len;
8915
8916 assert(PyUnicode_IS_ASCII(writer->buffer));
8917 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8918 out = PyUnicode_1BYTE_DATA(writer->buffer);
8919
Victor Stinner872b2912014-04-05 14:27:07 +02008920 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008922 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008923 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008924 int translate = unicode_fast_translate_lookup(mapping, ch,
8925 ascii_table);
8926 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008927 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008928 if (translate == 0)
8929 goto exit;
8930 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 }
Victor Stinner872b2912014-04-05 14:27:07 +02008932 if (ch2 == 0xfe) {
8933 if (ignore)
8934 continue;
8935 goto exit;
8936 }
8937 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008938 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008939 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 }
Victor Stinner872b2912014-04-05 14:27:07 +02008941 res = 1;
8942
8943exit:
8944 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008945 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008946 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008947}
8948
Victor Stinner3222da22015-10-01 22:07:32 +02008949static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950_PyUnicode_TranslateCharmap(PyObject *input,
8951 PyObject *mapping,
8952 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 Py_ssize_t size, i;
8957 int kind;
8958 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008959 _PyUnicodeWriter writer;
8960 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961 char *reason = "character maps to <undefined>";
8962 PyObject *errorHandler = NULL;
8963 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008966
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 PyErr_BadArgument();
8969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 if (PyUnicode_READY(input) == -1)
8973 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 kind = PyUnicode_KIND(input);
8976 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008978 if (size == 0)
8979 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981 /* allocate enough for a simple 1:1 translation without
8982 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 _PyUnicodeWriter_Init(&writer);
8984 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
Victor Stinner872b2912014-04-05 14:27:07 +02008987 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8988
Victor Stinner33798672016-03-01 21:59:58 +01008989 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008990 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008991 if (PyUnicode_IS_ASCII(input)) {
8992 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8993 if (res < 0) {
8994 _PyUnicodeWriter_Dealloc(&writer);
8995 return NULL;
8996 }
8997 if (res == 1)
8998 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008999 }
Victor Stinner33798672016-03-01 21:59:58 +01009000 else {
9001 i = 0;
9002 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009006 int translate;
9007 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9008 Py_ssize_t newpos;
9009 /* startpos for collecting untranslatable chars */
9010 Py_ssize_t collstart;
9011 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 ch = PyUnicode_READ(kind, data, i);
9015 translate = charmaptranslate_output(ch, mapping, &writer);
9016 if (translate < 0)
9017 goto onError;
9018
9019 if (translate != 0) {
9020 /* it worked => adjust input pointer */
9021 ++i;
9022 continue;
9023 }
9024
9025 /* untranslatable character */
9026 collstart = i;
9027 collend = i+1;
9028
9029 /* find all untranslatable characters */
9030 while (collend < size) {
9031 PyObject *x;
9032 ch = PyUnicode_READ(kind, data, collend);
9033 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009034 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 Py_XDECREF(x);
9036 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009038 ++collend;
9039 }
9040
9041 if (ignore) {
9042 i = collend;
9043 }
9044 else {
9045 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9046 reason, input, &exc,
9047 collstart, collend, &newpos);
9048 if (repunicode == NULL)
9049 goto onError;
9050 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009052 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009053 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009054 Py_DECREF(repunicode);
9055 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009056 }
9057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009058 Py_XDECREF(exc);
9059 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009060 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009063 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009064 Py_XDECREF(exc);
9065 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066 return NULL;
9067}
9068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069/* Deprecated. Use PyUnicode_Translate instead. */
9070PyObject *
9071PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9072 Py_ssize_t size,
9073 PyObject *mapping,
9074 const char *errors)
9075{
Christian Heimes5f520f42012-09-11 14:03:25 +02009076 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9078 if (!unicode)
9079 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009080 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9081 Py_DECREF(unicode);
9082 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083}
9084
Alexander Belopolsky40018472011-02-26 01:02:56 +00009085PyObject *
9086PyUnicode_Translate(PyObject *str,
9087 PyObject *mapping,
9088 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009090 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009091 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009092 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093}
Tim Petersced69f82003-09-16 20:30:58 +00009094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009096fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097{
9098 /* No need to call PyUnicode_READY(self) because this function is only
9099 called as a callback from fixup() which does it already. */
9100 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9101 const int kind = PyUnicode_KIND(self);
9102 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009103 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009104 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 Py_ssize_t i;
9106
9107 for (i = 0; i < len; ++i) {
9108 ch = PyUnicode_READ(kind, data, i);
9109 fixed = 0;
9110 if (ch > 127) {
9111 if (Py_UNICODE_ISSPACE(ch))
9112 fixed = ' ';
9113 else {
9114 const int decimal = Py_UNICODE_TODECIMAL(ch);
9115 if (decimal >= 0)
9116 fixed = '0' + decimal;
9117 }
9118 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009119 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009120 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 PyUnicode_WRITE(kind, data, i, fixed);
9122 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009123 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009124 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 }
9127
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009128 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129}
9130
9131PyObject *
9132_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9133{
9134 if (!PyUnicode_Check(unicode)) {
9135 PyErr_BadInternalCall();
9136 return NULL;
9137 }
9138 if (PyUnicode_READY(unicode) == -1)
9139 return NULL;
9140 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9141 /* If the string is already ASCII, just return the same string */
9142 Py_INCREF(unicode);
9143 return unicode;
9144 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009145 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146}
9147
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009148PyObject *
9149PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9150 Py_ssize_t length)
9151{
Victor Stinnerf0124502011-11-21 23:12:56 +01009152 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009153 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009154 Py_UCS4 maxchar;
9155 enum PyUnicode_Kind kind;
9156 void *data;
9157
Victor Stinner99d7ad02012-02-22 13:37:39 +01009158 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009159 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009160 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009161 if (ch > 127) {
9162 int decimal = Py_UNICODE_TODECIMAL(ch);
9163 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009164 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009165 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009166 }
9167 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009168
9169 /* Copy to a new string */
9170 decimal = PyUnicode_New(length, maxchar);
9171 if (decimal == NULL)
9172 return decimal;
9173 kind = PyUnicode_KIND(decimal);
9174 data = PyUnicode_DATA(decimal);
9175 /* Iterate over code points */
9176 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009177 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009178 if (ch > 127) {
9179 int decimal = Py_UNICODE_TODECIMAL(ch);
9180 if (decimal >= 0)
9181 ch = '0' + decimal;
9182 }
9183 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009185 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009186}
/* --- Decimal Encoder ---------------------------------------------------- */
9188
Alexander Belopolsky40018472011-02-26 01:02:56 +00009189int
9190PyUnicode_EncodeDecimal(Py_UNICODE *s,
9191 Py_ssize_t length,
9192 char *output,
9193 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009194{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009195 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009196 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009197 enum PyUnicode_Kind kind;
9198 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009199
9200 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 PyErr_BadArgument();
9202 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009203 }
9204
Victor Stinner42bf7752011-11-21 22:52:58 +01009205 unicode = PyUnicode_FromUnicode(s, length);
9206 if (unicode == NULL)
9207 return -1;
9208
Benjamin Petersonbac79492012-01-14 13:34:47 -05009209 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009210 Py_DECREF(unicode);
9211 return -1;
9212 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009213 kind = PyUnicode_KIND(unicode);
9214 data = PyUnicode_DATA(unicode);
9215
Victor Stinnerb84d7232011-11-22 01:50:07 +01009216 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009217 PyObject *exc;
9218 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009219 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009220 Py_ssize_t startpos;
9221
9222 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009223
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009225 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009226 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009228 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 decimal = Py_UNICODE_TODECIMAL(ch);
9230 if (decimal >= 0) {
9231 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009232 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 continue;
9234 }
9235 if (0 < ch && ch < 256) {
9236 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009237 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 continue;
9239 }
Victor Stinner6345be92011-11-25 20:09:01 +01009240
Victor Stinner42bf7752011-11-21 22:52:58 +01009241 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009242 exc = NULL;
9243 raise_encode_exception(&exc, "decimal", unicode,
9244 startpos, startpos+1,
9245 "invalid decimal Unicode string");
9246 Py_XDECREF(exc);
9247 Py_DECREF(unicode);
9248 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009249 }
9250 /* 0-terminate the output string */
9251 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009252 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009253 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009254}
9255
/* --- Helpers ------------------------------------------------------------ */
9257
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and maps negative `start`/`end` values to
   offsets from the end of the sequence (never below 0).  `start` and
   `end` must be modifiable lvalues; every argument may be evaluated more
   than once, so pass only side-effect-free expressions.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to a single statement and can be used safely in an unbraced if/else.
   It must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009274any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009276 Py_ssize_t end,
9277 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 void *buf1, *buf2;
9281 Py_ssize_t len1, len2, result;
9282
9283 kind1 = PyUnicode_KIND(s1);
9284 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009285 if (kind1 < kind2)
9286 return -1;
9287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 len1 = PyUnicode_GET_LENGTH(s1);
9289 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009290 ADJUST_INDICES(start, end, len1);
9291 if (end - start < len2)
9292 return -1;
9293
9294 buf1 = PyUnicode_DATA(s1);
9295 buf2 = PyUnicode_DATA(s2);
9296 if (len2 == 1) {
9297 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9298 result = findchar((const char *)buf1 + kind1*start,
9299 kind1, end - start, ch, direction);
9300 if (result == -1)
9301 return -1;
9302 else
9303 return start + result;
9304 }
9305
9306 if (kind2 != kind1) {
9307 buf2 = _PyUnicode_AsKind(s2, kind1);
9308 if (!buf2)
9309 return -2;
9310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311
Victor Stinner794d5672011-10-10 03:21:36 +02009312 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009313 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009314 case PyUnicode_1BYTE_KIND:
9315 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9316 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9317 else
9318 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9319 break;
9320 case PyUnicode_2BYTE_KIND:
9321 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9322 break;
9323 case PyUnicode_4BYTE_KIND:
9324 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9325 break;
9326 default:
9327 assert(0); result = -2;
9328 }
9329 }
9330 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009331 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009332 case PyUnicode_1BYTE_KIND:
9333 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9334 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9335 else
9336 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337 break;
9338 case PyUnicode_2BYTE_KIND:
9339 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340 break;
9341 case PyUnicode_4BYTE_KIND:
9342 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9343 break;
9344 default:
9345 assert(0); result = -2;
9346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
9348
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009349 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 PyMem_Free(buf2);
9351
9352 return result;
9353}
9354
9355Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009356_PyUnicode_InsertThousandsGrouping(
9357 PyObject *unicode, Py_ssize_t index,
9358 Py_ssize_t n_buffer,
9359 void *digits, Py_ssize_t n_digits,
9360 Py_ssize_t min_width,
9361 const char *grouping, PyObject *thousands_sep,
9362 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363{
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 Py_ssize_t thousands_sep_len;
9367 Py_ssize_t len;
9368
9369 if (unicode != NULL) {
9370 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009371 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 }
9373 else {
9374 kind = PyUnicode_1BYTE_KIND;
9375 data = NULL;
9376 }
9377 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9378 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9379 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9380 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009381 if (thousands_sep_kind < kind) {
9382 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9383 if (!thousands_sep_data)
9384 return -1;
9385 }
9386 else {
9387 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9388 if (!data)
9389 return -1;
9390 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009391 }
9392
Benjamin Petersonead6b532011-12-20 17:23:42 -06009393 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009395 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009397 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009399 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009400 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009401 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009402 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009403 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009404 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009405 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009407 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009408 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009409 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009410 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009411 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009413 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009414 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009415 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009416 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009417 break;
9418 default:
9419 assert(0);
9420 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009422 if (unicode != NULL && thousands_sep_kind != kind) {
9423 if (thousands_sep_kind < kind)
9424 PyMem_Free(thousands_sep_data);
9425 else
9426 PyMem_Free(data);
9427 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009428 if (unicode == NULL) {
9429 *maxchar = 127;
9430 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009431 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009432 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009433 }
9434 }
9435 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436}
9437
9438
Alexander Belopolsky40018472011-02-26 01:02:56 +00009439Py_ssize_t
9440PyUnicode_Count(PyObject *str,
9441 PyObject *substr,
9442 Py_ssize_t start,
9443 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009445 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 void *buf1 = NULL, *buf2 = NULL;
9448 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009449
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009450 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009451 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009452
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009453 kind1 = PyUnicode_KIND(str);
9454 kind2 = PyUnicode_KIND(substr);
9455 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009456 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009457
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009458 len1 = PyUnicode_GET_LENGTH(str);
9459 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009461 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009462 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009463
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009464 buf1 = PyUnicode_DATA(str);
9465 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009466 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009467 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009468 if (!buf2)
9469 goto onError;
9470 }
9471
9472 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009474 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009475 result = asciilib_count(
9476 ((Py_UCS1*)buf1) + start, end - start,
9477 buf2, len2, PY_SSIZE_T_MAX
9478 );
9479 else
9480 result = ucs1lib_count(
9481 ((Py_UCS1*)buf1) + start, end - start,
9482 buf2, len2, PY_SSIZE_T_MAX
9483 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 break;
9485 case PyUnicode_2BYTE_KIND:
9486 result = ucs2lib_count(
9487 ((Py_UCS2*)buf1) + start, end - start,
9488 buf2, len2, PY_SSIZE_T_MAX
9489 );
9490 break;
9491 case PyUnicode_4BYTE_KIND:
9492 result = ucs4lib_count(
9493 ((Py_UCS4*)buf1) + start, end - start,
9494 buf2, len2, PY_SSIZE_T_MAX
9495 );
9496 break;
9497 default:
9498 assert(0); result = 0;
9499 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009500
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009501 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 PyMem_Free(buf2);
9503
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 PyMem_Free(buf2);
9508 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511Py_ssize_t
9512PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009513 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009514 Py_ssize_t start,
9515 Py_ssize_t end,
9516 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009518 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009520
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009521 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522}
9523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524Py_ssize_t
9525PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9526 Py_ssize_t start, Py_ssize_t end,
9527 int direction)
9528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009530 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 if (PyUnicode_READY(str) == -1)
9532 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009533 if (start < 0 || end < 0) {
9534 PyErr_SetString(PyExc_IndexError, "string index out of range");
9535 return -2;
9536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 if (end > PyUnicode_GET_LENGTH(str))
9538 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009539 if (start >= end)
9540 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009542 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9543 kind, end-start, ch, direction);
9544 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009546 else
9547 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548}
9549
Alexander Belopolsky40018472011-02-26 01:02:56 +00009550static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009551tailmatch(PyObject *self,
9552 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009553 Py_ssize_t start,
9554 Py_ssize_t end,
9555 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 int kind_self;
9558 int kind_sub;
9559 void *data_self;
9560 void *data_sub;
9561 Py_ssize_t offset;
9562 Py_ssize_t i;
9563 Py_ssize_t end_sub;
9564
9565 if (PyUnicode_READY(self) == -1 ||
9566 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009567 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9570 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009574 if (PyUnicode_GET_LENGTH(substring) == 0)
9575 return 1;
9576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 kind_self = PyUnicode_KIND(self);
9578 data_self = PyUnicode_DATA(self);
9579 kind_sub = PyUnicode_KIND(substring);
9580 data_sub = PyUnicode_DATA(substring);
9581 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9582
9583 if (direction > 0)
9584 offset = end;
9585 else
9586 offset = start;
9587
9588 if (PyUnicode_READ(kind_self, data_self, offset) ==
9589 PyUnicode_READ(kind_sub, data_sub, 0) &&
9590 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9591 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9592 /* If both are of the same kind, memcmp is sufficient */
9593 if (kind_self == kind_sub) {
9594 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009595 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 data_sub,
9597 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009598 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009600 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 else {
9602 /* We do not need to compare 0 and len(substring)-1 because
9603 the if statement above ensured already that they are equal
9604 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 for (i = 1; i < end_sub; ++i) {
9606 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9607 PyUnicode_READ(kind_sub, data_sub, i))
9608 return 0;
9609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612 }
9613
9614 return 0;
9615}
9616
Alexander Belopolsky40018472011-02-26 01:02:56 +00009617Py_ssize_t
9618PyUnicode_Tailmatch(PyObject *str,
9619 PyObject *substr,
9620 Py_ssize_t start,
9621 Py_ssize_t end,
9622 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009624 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009627 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628}
9629
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630/* Apply fixfct filter to the Unicode object self and return a
9631 reference to the modified object */
9632
Alexander Belopolsky40018472011-02-26 01:02:56 +00009633static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009634fixup(PyObject *self,
9635 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 PyObject *u;
9638 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009639 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009641 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009644 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 /* fix functions return the new maximum character in a string,
9647 if the kind of the resulting unicode object does not change,
9648 everything is fine. Otherwise we need to change the string kind
9649 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009650 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009651
9652 if (maxchar_new == 0) {
9653 /* no changes */;
9654 if (PyUnicode_CheckExact(self)) {
9655 Py_DECREF(u);
9656 Py_INCREF(self);
9657 return self;
9658 }
9659 else
9660 return u;
9661 }
9662
Victor Stinnere6abb482012-05-02 01:15:40 +02009663 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664
Victor Stinnereaab6042011-12-11 22:22:39 +01009665 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009667
9668 /* In case the maximum character changed, we need to
9669 convert the string to the new category. */
9670 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9671 if (v == NULL) {
9672 Py_DECREF(u);
9673 return NULL;
9674 }
9675 if (maxchar_new > maxchar_old) {
9676 /* If the maxchar increased so that the kind changed, not all
9677 characters are representable anymore and we need to fix the
9678 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009679 _PyUnicode_FastCopyCharacters(v, 0,
9680 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009681 maxchar_old = fixfct(v);
9682 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 }
9684 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009685 _PyUnicode_FastCopyCharacters(v, 0,
9686 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009688 Py_DECREF(u);
9689 assert(_PyUnicode_CheckConsistency(v, 1));
9690 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691}
9692
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693static PyObject *
9694ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9697 char *resdata, *data = PyUnicode_DATA(self);
9698 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009699
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 res = PyUnicode_New(len, 127);
9701 if (res == NULL)
9702 return NULL;
9703 resdata = PyUnicode_DATA(res);
9704 if (lower)
9705 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 _Py_bytes_upper(resdata, data, len);
9708 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709}
9710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 Py_ssize_t j;
9715 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009716 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009718
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9720
9721 where ! is a negation and \p{xxx} is a character with property xxx.
9722 */
9723 for (j = i - 1; j >= 0; j--) {
9724 c = PyUnicode_READ(kind, data, j);
9725 if (!_PyUnicode_IsCaseIgnorable(c))
9726 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009728 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9729 if (final_sigma) {
9730 for (j = i + 1; j < length; j++) {
9731 c = PyUnicode_READ(kind, data, j);
9732 if (!_PyUnicode_IsCaseIgnorable(c))
9733 break;
9734 }
9735 final_sigma = j == length || !_PyUnicode_IsCased(c);
9736 }
9737 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738}
9739
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740static int
9741lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9742 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744 /* Obscure special case. */
9745 if (c == 0x3A3) {
9746 mapped[0] = handle_capital_sigma(kind, data, length, i);
9747 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750}
9751
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752static Py_ssize_t
9753do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 Py_ssize_t i, k = 0;
9756 int n_res, j;
9757 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009758
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 c = PyUnicode_READ(kind, data, 0);
9760 n_res = _PyUnicode_ToUpperFull(c, mapped);
9761 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009762 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009765 for (i = 1; i < length; i++) {
9766 c = PyUnicode_READ(kind, data, i);
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009769 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009771 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009772 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774}
9775
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776static Py_ssize_t
9777do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9778 Py_ssize_t i, k = 0;
9779
9780 for (i = 0; i < length; i++) {
9781 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9782 int n_res, j;
9783 if (Py_UNICODE_ISUPPER(c)) {
9784 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9785 }
9786 else if (Py_UNICODE_ISLOWER(c)) {
9787 n_res = _PyUnicode_ToUpperFull(c, mapped);
9788 }
9789 else {
9790 n_res = 1;
9791 mapped[0] = c;
9792 }
9793 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009794 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795 res[k++] = mapped[j];
9796 }
9797 }
9798 return k;
9799}
9800
9801static Py_ssize_t
9802do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9803 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009805 Py_ssize_t i, k = 0;
9806
9807 for (i = 0; i < length; i++) {
9808 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9809 int n_res, j;
9810 if (lower)
9811 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9812 else
9813 n_res = _PyUnicode_ToUpperFull(c, mapped);
9814 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009815 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 res[k++] = mapped[j];
9817 }
9818 }
9819 return k;
9820}
9821
9822static Py_ssize_t
9823do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9824{
9825 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9826}
9827
9828static Py_ssize_t
9829do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9830{
9831 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9832}
9833
Benjamin Petersone51757f2012-01-12 21:10:29 -05009834static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009835do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9836{
9837 Py_ssize_t i, k = 0;
9838
9839 for (i = 0; i < length; i++) {
9840 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9841 Py_UCS4 mapped[3];
9842 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9843 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009844 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009845 res[k++] = mapped[j];
9846 }
9847 }
9848 return k;
9849}
9850
9851static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009852do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853{
9854 Py_ssize_t i, k = 0;
9855 int previous_is_cased;
9856
9857 previous_is_cased = 0;
9858 for (i = 0; i < length; i++) {
9859 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9860 Py_UCS4 mapped[3];
9861 int n_res, j;
9862
9863 if (previous_is_cased)
9864 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9865 else
9866 n_res = _PyUnicode_ToTitleFull(c, mapped);
9867
9868 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009869 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009870 res[k++] = mapped[j];
9871 }
9872
9873 previous_is_cased = _PyUnicode_IsCased(c);
9874 }
9875 return k;
9876}
9877
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878static PyObject *
9879case_operation(PyObject *self,
9880 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9881{
9882 PyObject *res = NULL;
9883 Py_ssize_t length, newlength = 0;
9884 int kind, outkind;
9885 void *data, *outdata;
9886 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9887
Benjamin Petersoneea48462012-01-16 14:28:50 -05009888 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009889
9890 kind = PyUnicode_KIND(self);
9891 data = PyUnicode_DATA(self);
9892 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009893 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009894 PyErr_SetString(PyExc_OverflowError, "string is too long");
9895 return NULL;
9896 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009897 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009898 if (tmp == NULL)
9899 return PyErr_NoMemory();
9900 newlength = perform(kind, data, length, tmp, &maxchar);
9901 res = PyUnicode_New(newlength, maxchar);
9902 if (res == NULL)
9903 goto leave;
9904 tmpend = tmp + newlength;
9905 outdata = PyUnicode_DATA(res);
9906 outkind = PyUnicode_KIND(res);
9907 switch (outkind) {
9908 case PyUnicode_1BYTE_KIND:
9909 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9910 break;
9911 case PyUnicode_2BYTE_KIND:
9912 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9913 break;
9914 case PyUnicode_4BYTE_KIND:
9915 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9916 break;
9917 default:
9918 assert(0);
9919 break;
9920 }
9921 leave:
9922 PyMem_FREE(tmp);
9923 return res;
9924}
9925
Tim Peters8ce9f162004-08-27 01:49:32 +00009926PyObject *
9927PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009929 PyObject *res;
9930 PyObject *fseq;
9931 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009932 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009934 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009935 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009936 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009937 }
9938
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 /* NOTE: the following code can't call back into Python code,
9940 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009941 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009943 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009944 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009945 res = _PyUnicode_JoinArray(separator, items, seqlen);
9946 Py_DECREF(fseq);
9947 return res;
9948}
9949
9950PyObject *
9951_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9952{
9953 PyObject *res = NULL; /* the result */
9954 PyObject *sep = NULL;
9955 Py_ssize_t seplen;
9956 PyObject *item;
9957 Py_ssize_t sz, i, res_offset;
9958 Py_UCS4 maxchar;
9959 Py_UCS4 item_maxchar;
9960 int use_memcpy;
9961 unsigned char *res_data = NULL, *sep_data = NULL;
9962 PyObject *last_obj;
9963 unsigned int kind = 0;
9964
Tim Peters05eba1f2004-08-27 21:32:02 +00009965 /* If empty sequence, return u"". */
9966 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009967 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009968 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009969
Tim Peters05eba1f2004-08-27 21:32:02 +00009970 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009972 if (seqlen == 1) {
9973 if (PyUnicode_CheckExact(items[0])) {
9974 res = items[0];
9975 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009976 return res;
9977 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009978 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009979 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009980 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009981 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009982 /* Set up sep and seplen */
9983 if (separator == NULL) {
9984 /* fall back to a blank space separator */
9985 sep = PyUnicode_FromOrdinal(' ');
9986 if (!sep)
9987 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009988 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009989 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009990 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009991 else {
9992 if (!PyUnicode_Check(separator)) {
9993 PyErr_Format(PyExc_TypeError,
9994 "separator: expected str instance,"
9995 " %.80s found",
9996 Py_TYPE(separator)->tp_name);
9997 goto onError;
9998 }
9999 if (PyUnicode_READY(separator))
10000 goto onError;
10001 sep = separator;
10002 seplen = PyUnicode_GET_LENGTH(separator);
10003 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10004 /* inc refcount to keep this code path symmetric with the
10005 above case of a blank separator */
10006 Py_INCREF(sep);
10007 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010009 }
10010
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010011 /* There are at least two things to join, or else we have a subclass
10012 * of str in the sequence.
10013 * Do a pre-pass to figure out the total amount of space we'll
10014 * need (sz), and see whether all argument are strings.
10015 */
10016 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010017#ifdef Py_DEBUG
10018 use_memcpy = 0;
10019#else
10020 use_memcpy = 1;
10021#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010022 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010023 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010024 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 if (!PyUnicode_Check(item)) {
10026 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +020010027 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 " %.80s found",
10029 i, Py_TYPE(item)->tp_name);
10030 goto onError;
10031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 if (PyUnicode_READY(item) == -1)
10033 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010034 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010036 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010037 if (i != 0) {
10038 add_sz += seplen;
10039 }
10040 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010041 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010042 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010043 goto onError;
10044 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010045 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 if (use_memcpy && last_obj != NULL) {
10047 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10048 use_memcpy = 0;
10049 }
10050 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010051 }
Tim Petersced69f82003-09-16 20:30:58 +000010052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010054 if (res == NULL)
10055 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010056
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010057 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010058#ifdef Py_DEBUG
10059 use_memcpy = 0;
10060#else
10061 if (use_memcpy) {
10062 res_data = PyUnicode_1BYTE_DATA(res);
10063 kind = PyUnicode_KIND(res);
10064 if (seplen != 0)
10065 sep_data = PyUnicode_1BYTE_DATA(sep);
10066 }
10067#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010068 if (use_memcpy) {
10069 for (i = 0; i < seqlen; ++i) {
10070 Py_ssize_t itemlen;
10071 item = items[i];
10072
10073 /* Copy item, and maybe the separator. */
10074 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010075 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010076 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 kind * seplen);
10078 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010079 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010080
10081 itemlen = PyUnicode_GET_LENGTH(item);
10082 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010083 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010084 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010085 kind * itemlen);
10086 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010087 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010088 }
10089 assert(res_data == PyUnicode_1BYTE_DATA(res)
10090 + kind * PyUnicode_GET_LENGTH(res));
10091 }
10092 else {
10093 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10094 Py_ssize_t itemlen;
10095 item = items[i];
10096
10097 /* Copy item, and maybe the separator. */
10098 if (i && seplen != 0) {
10099 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10100 res_offset += seplen;
10101 }
10102
10103 itemlen = PyUnicode_GET_LENGTH(item);
10104 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010105 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010106 res_offset += itemlen;
10107 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010108 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010109 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010110 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010113 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010118 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119 return NULL;
10120}
10121
/* Store `value` into `length` consecutive characters of the buffer `data`
   (interpreted according to `kind`), starting at character index `start`.

   All arguments are fully parenthesized in the expansion so that
   expressions (e.g. `c ? a : b`) can be passed safely.  Note that `value`
   and `length` may be evaluated more than once, so they must be free of
   side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10145
Victor Stinnerd3f08822012-05-29 12:57:52 +020010146void
10147_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10148 Py_UCS4 fill_char)
10149{
10150 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10151 const void *data = PyUnicode_DATA(unicode);
10152 assert(PyUnicode_IS_READY(unicode));
10153 assert(unicode_modifiable(unicode));
10154 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10155 assert(start >= 0);
10156 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10157 FILL(kind, data, fill_char, start, length);
10158}
10159
Victor Stinner3fe55312012-01-04 00:33:50 +010010160Py_ssize_t
10161PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10162 Py_UCS4 fill_char)
10163{
10164 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010165
10166 if (!PyUnicode_Check(unicode)) {
10167 PyErr_BadInternalCall();
10168 return -1;
10169 }
10170 if (PyUnicode_READY(unicode) == -1)
10171 return -1;
10172 if (unicode_check_modifiable(unicode))
10173 return -1;
10174
Victor Stinnerd3f08822012-05-29 12:57:52 +020010175 if (start < 0) {
10176 PyErr_SetString(PyExc_IndexError, "string index out of range");
10177 return -1;
10178 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010179 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10180 PyErr_SetString(PyExc_ValueError,
10181 "fill character is bigger than "
10182 "the string maximum character");
10183 return -1;
10184 }
10185
10186 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10187 length = Py_MIN(maxlen, length);
10188 if (length <= 0)
10189 return 0;
10190
Victor Stinnerd3f08822012-05-29 12:57:52 +020010191 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010192 return length;
10193}
10194
Victor Stinner9310abb2011-10-05 00:59:23 +020010195static PyObject *
10196pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010197 Py_ssize_t left,
10198 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 PyObject *u;
10202 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010203 int kind;
10204 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205
10206 if (left < 0)
10207 left = 0;
10208 if (right < 0)
10209 right = 0;
10210
Victor Stinnerc4b49542011-12-11 22:44:26 +010010211 if (left == 0 && right == 0)
10212 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10215 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010216 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10217 return NULL;
10218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010220 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010222 if (!u)
10223 return NULL;
10224
10225 kind = PyUnicode_KIND(u);
10226 data = PyUnicode_DATA(u);
10227 if (left)
10228 FILL(kind, data, fill, 0, left);
10229 if (right)
10230 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010231 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010232 assert(_PyUnicode_CheckConsistency(u, 1));
10233 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Alexander Belopolsky40018472011-02-26 01:02:56 +000010236PyObject *
10237PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010241 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
Benjamin Petersonead6b532011-12-20 17:23:42 -060010244 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 if (PyUnicode_IS_ASCII(string))
10247 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 PyUnicode_GET_LENGTH(string), keepends);
10250 else
10251 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010253 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 break;
10255 case PyUnicode_2BYTE_KIND:
10256 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 PyUnicode_GET_LENGTH(string), keepends);
10259 break;
10260 case PyUnicode_4BYTE_KIND:
10261 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 PyUnicode_GET_LENGTH(string), keepends);
10264 break;
10265 default:
10266 assert(0);
10267 list = 0;
10268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270}
10271
Alexander Belopolsky40018472011-02-26 01:02:56 +000010272static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010273split(PyObject *self,
10274 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010275 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010277 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 void *buf1, *buf2;
10279 Py_ssize_t len1, len2;
10280 PyObject* out;
10281
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010283 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 if (PyUnicode_READY(self) == -1)
10286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010289 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 if (PyUnicode_IS_ASCII(self))
10292 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010294 PyUnicode_GET_LENGTH(self), maxcount
10295 );
10296 else
10297 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 PyUnicode_GET_LENGTH(self), maxcount
10300 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 case PyUnicode_2BYTE_KIND:
10302 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010303 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 PyUnicode_GET_LENGTH(self), maxcount
10305 );
10306 case PyUnicode_4BYTE_KIND:
10307 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 PyUnicode_GET_LENGTH(self), maxcount
10310 );
10311 default:
10312 assert(0);
10313 return NULL;
10314 }
10315
10316 if (PyUnicode_READY(substring) == -1)
10317 return NULL;
10318
10319 kind1 = PyUnicode_KIND(self);
10320 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 len1 = PyUnicode_GET_LENGTH(self);
10322 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010323 if (kind1 < kind2 || len1 < len2) {
10324 out = PyList_New(1);
10325 if (out == NULL)
10326 return NULL;
10327 Py_INCREF(self);
10328 PyList_SET_ITEM(out, 0, self);
10329 return out;
10330 }
10331 buf1 = PyUnicode_DATA(self);
10332 buf2 = PyUnicode_DATA(substring);
10333 if (kind2 != kind1) {
10334 buf2 = _PyUnicode_AsKind(substring, kind1);
10335 if (!buf2)
10336 return NULL;
10337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010339 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10342 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 else
10345 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 break;
10348 case PyUnicode_2BYTE_KIND:
10349 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 break;
10352 case PyUnicode_4BYTE_KIND:
10353 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 break;
10356 default:
10357 out = NULL;
10358 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 PyMem_Free(buf2);
10361 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362}
10363
Alexander Belopolsky40018472011-02-26 01:02:56 +000010364static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010365rsplit(PyObject *self,
10366 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010367 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010368{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010369 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 void *buf1, *buf2;
10371 Py_ssize_t len1, len2;
10372 PyObject* out;
10373
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010374 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010375 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (PyUnicode_READY(self) == -1)
10378 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010381 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010383 if (PyUnicode_IS_ASCII(self))
10384 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 PyUnicode_GET_LENGTH(self), maxcount
10387 );
10388 else
10389 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 PyUnicode_GET_LENGTH(self), maxcount
10392 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 case PyUnicode_2BYTE_KIND:
10394 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 PyUnicode_GET_LENGTH(self), maxcount
10397 );
10398 case PyUnicode_4BYTE_KIND:
10399 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 PyUnicode_GET_LENGTH(self), maxcount
10402 );
10403 default:
10404 assert(0);
10405 return NULL;
10406 }
10407
10408 if (PyUnicode_READY(substring) == -1)
10409 return NULL;
10410
10411 kind1 = PyUnicode_KIND(self);
10412 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 len1 = PyUnicode_GET_LENGTH(self);
10414 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010415 if (kind1 < kind2 || len1 < len2) {
10416 out = PyList_New(1);
10417 if (out == NULL)
10418 return NULL;
10419 Py_INCREF(self);
10420 PyList_SET_ITEM(out, 0, self);
10421 return out;
10422 }
10423 buf1 = PyUnicode_DATA(self);
10424 buf2 = PyUnicode_DATA(substring);
10425 if (kind2 != kind1) {
10426 buf2 = _PyUnicode_AsKind(substring, kind1);
10427 if (!buf2)
10428 return NULL;
10429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010431 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10434 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010435 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010436 else
10437 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010438 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 break;
10440 case PyUnicode_2BYTE_KIND:
10441 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010442 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 break;
10444 case PyUnicode_4BYTE_KIND:
10445 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010446 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 break;
10448 default:
10449 out = NULL;
10450 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010451 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 PyMem_Free(buf2);
10453 return out;
10454}
10455
10456static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010457anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10458 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010460 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010462 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10463 return asciilib_find(buf1, len1, buf2, len2, offset);
10464 else
10465 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 case PyUnicode_2BYTE_KIND:
10467 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10468 case PyUnicode_4BYTE_KIND:
10469 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10470 }
10471 assert(0);
10472 return -1;
10473}
10474
10475static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010476anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10477 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010479 switch (kind) {
10480 case PyUnicode_1BYTE_KIND:
10481 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10482 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10483 else
10484 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10485 case PyUnicode_2BYTE_KIND:
10486 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10487 case PyUnicode_4BYTE_KIND:
10488 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10489 }
10490 assert(0);
10491 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010492}
10493
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010494static void
10495replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10496 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10497{
10498 int kind = PyUnicode_KIND(u);
10499 void *data = PyUnicode_DATA(u);
10500 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10501 if (kind == PyUnicode_1BYTE_KIND) {
10502 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10503 (Py_UCS1 *)data + len,
10504 u1, u2, maxcount);
10505 }
10506 else if (kind == PyUnicode_2BYTE_KIND) {
10507 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10508 (Py_UCS2 *)data + len,
10509 u1, u2, maxcount);
10510 }
10511 else {
10512 assert(kind == PyUnicode_4BYTE_KIND);
10513 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10514 (Py_UCS4 *)data + len,
10515 u1, u2, maxcount);
10516 }
10517}
10518
Alexander Belopolsky40018472011-02-26 01:02:56 +000010519static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520replace(PyObject *self, PyObject *str1,
10521 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 PyObject *u;
10524 char *sbuf = PyUnicode_DATA(self);
10525 char *buf1 = PyUnicode_DATA(str1);
10526 char *buf2 = PyUnicode_DATA(str2);
10527 int srelease = 0, release1 = 0, release2 = 0;
10528 int skind = PyUnicode_KIND(self);
10529 int kind1 = PyUnicode_KIND(str1);
10530 int kind2 = PyUnicode_KIND(str2);
10531 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10532 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10533 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010534 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010535 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
10537 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010540 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541
Victor Stinner59de0ee2011-10-07 10:01:28 +020010542 if (str1 == str2)
10543 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544
Victor Stinner49a0a212011-10-12 23:46:10 +020010545 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010546 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10547 if (maxchar < maxchar_str1)
10548 /* substring too wide to be present */
10549 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10551 /* Replacing str1 with str2 may cause a maxchar reduction in the
10552 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010553 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010554 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010557 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010559 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010561 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010562 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010563 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010564
Victor Stinner69ed0f42013-04-09 21:48:24 +020010565 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010566 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010567 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010568 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010569 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010571 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010573
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010574 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10575 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 }
10577 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 int rkind = skind;
10579 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010580 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (kind1 < rkind) {
10583 /* widen substring */
10584 buf1 = _PyUnicode_AsKind(str1, rkind);
10585 if (!buf1) goto error;
10586 release1 = 1;
10587 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010589 if (i < 0)
10590 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (rkind > kind2) {
10592 /* widen replacement */
10593 buf2 = _PyUnicode_AsKind(str2, rkind);
10594 if (!buf2) goto error;
10595 release2 = 1;
10596 }
10597 else if (rkind < kind2) {
10598 /* widen self and buf1 */
10599 rkind = kind2;
10600 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010601 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 sbuf = _PyUnicode_AsKind(self, rkind);
10603 if (!sbuf) goto error;
10604 srelease = 1;
10605 buf1 = _PyUnicode_AsKind(str1, rkind);
10606 if (!buf1) goto error;
10607 release1 = 1;
10608 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010609 u = PyUnicode_New(slen, maxchar);
10610 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 assert(PyUnicode_KIND(u) == rkind);
10613 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010614
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010615 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010616 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010621
10622 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010623 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010625 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010626 if (i == -1)
10627 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010628 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010634 }
10635 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010637 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 int rkind = skind;
10639 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010642 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 buf1 = _PyUnicode_AsKind(str1, rkind);
10644 if (!buf1) goto error;
10645 release1 = 1;
10646 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010647 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 if (n == 0)
10649 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 buf2 = _PyUnicode_AsKind(str2, rkind);
10653 if (!buf2) goto error;
10654 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 rkind = kind2;
10659 sbuf = _PyUnicode_AsKind(self, rkind);
10660 if (!sbuf) goto error;
10661 srelease = 1;
10662 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010663 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 buf1 = _PyUnicode_AsKind(str1, rkind);
10665 if (!buf1) goto error;
10666 release1 = 1;
10667 }
10668 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10669 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010670 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 PyErr_SetString(PyExc_OverflowError,
10672 "replace string is too long");
10673 goto error;
10674 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010675 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010676 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010677 _Py_INCREF_UNICODE_EMPTY();
10678 if (!unicode_empty)
10679 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010680 u = unicode_empty;
10681 goto done;
10682 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010683 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 PyErr_SetString(PyExc_OverflowError,
10685 "replace string is too long");
10686 goto error;
10687 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010688 u = PyUnicode_New(new_size, maxchar);
10689 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010691 assert(PyUnicode_KIND(u) == rkind);
10692 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 ires = i = 0;
10694 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 while (n-- > 0) {
10696 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010697 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010698 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010699 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010700 if (j == -1)
10701 break;
10702 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010703 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010704 memcpy(res + rkind * ires,
10705 sbuf + rkind * i,
10706 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 }
10709 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010711 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010713 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010715 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010720 memcpy(res + rkind * ires,
10721 sbuf + rkind * i,
10722 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010723 }
10724 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010725 /* interleave */
10726 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010727 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010729 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010731 if (--n <= 0)
10732 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010733 memcpy(res + rkind * ires,
10734 sbuf + rkind * i,
10735 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 ires++;
10737 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010738 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010739 memcpy(res + rkind * ires,
10740 sbuf + rkind * i,
10741 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010742 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010743 }
10744
10745 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010746 unicode_adjust_maxchar(&u);
10747 if (u == NULL)
10748 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010750
10751 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (srelease)
10753 PyMem_FREE(sbuf);
10754 if (release1)
10755 PyMem_FREE(buf1);
10756 if (release2)
10757 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010758 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010760
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 if (srelease)
10764 PyMem_FREE(sbuf);
10765 if (release1)
10766 PyMem_FREE(buf1);
10767 if (release2)
10768 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010769 return unicode_result_unchanged(self);
10770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 error:
10772 if (srelease && sbuf)
10773 PyMem_FREE(sbuf);
10774 if (release1 && buf1)
10775 PyMem_FREE(buf1);
10776 if (release2 && buf2)
10777 PyMem_FREE(buf2);
10778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779}
10780
10781/* --- Unicode Object Methods --------------------------------------------- */
10782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010783PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785\n\
10786Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010787characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788
10789static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010790unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010792 if (PyUnicode_READY(self) == -1)
10793 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010794 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795}
10796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010797PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799\n\
10800Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010801have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802
10803static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010804unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010806 if (PyUnicode_READY(self) == -1)
10807 return NULL;
10808 if (PyUnicode_GET_LENGTH(self) == 0)
10809 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010810 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811}
10812
Benjamin Petersond5890c82012-01-14 13:23:30 -050010813PyDoc_STRVAR(casefold__doc__,
10814 "S.casefold() -> str\n\
10815\n\
10816Return a version of S suitable for caseless comparisons.");
10817
10818static PyObject *
10819unicode_casefold(PyObject *self)
10820{
10821 if (PyUnicode_READY(self) == -1)
10822 return NULL;
10823 if (PyUnicode_IS_ASCII(self))
10824 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010825 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010826}
10827
10828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010829/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010830
10831static int
10832convert_uc(PyObject *obj, void *addr)
10833{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010835
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010836 if (!PyUnicode_Check(obj)) {
10837 PyErr_Format(PyExc_TypeError,
10838 "The fill character must be a unicode character, "
10839 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010840 return 0;
10841 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010842 if (PyUnicode_READY(obj) < 0)
10843 return 0;
10844 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010845 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 return 0;
10848 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010849 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010850 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010851}
10852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010853PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010854 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010856Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010857done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858
10859static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010860unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010862 Py_ssize_t marg, left;
10863 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 Py_UCS4 fillchar = ' ';
10865
Victor Stinnere9a29352011-10-01 02:14:59 +020010866 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Benjamin Petersonbac79492012-01-14 13:34:47 -050010869 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870 return NULL;
10871
Victor Stinnerc4b49542011-12-11 22:44:26 +010010872 if (PyUnicode_GET_LENGTH(self) >= width)
10873 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
Victor Stinnerc4b49542011-12-11 22:44:26 +010010875 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876 left = marg / 2 + (marg & width & 1);
10877
Victor Stinner9310abb2011-10-05 00:59:23 +020010878 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879}
10880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881/* This function assumes that str1 and str2 are readied by the caller. */
10882
Marc-André Lemburge5034372000-08-08 08:04:29 +000010883static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010884unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010885{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010886#define COMPARE(TYPE1, TYPE2) \
10887 do { \
10888 TYPE1* p1 = (TYPE1 *)data1; \
10889 TYPE2* p2 = (TYPE2 *)data2; \
10890 TYPE1* end = p1 + len; \
10891 Py_UCS4 c1, c2; \
10892 for (; p1 != end; p1++, p2++) { \
10893 c1 = *p1; \
10894 c2 = *p2; \
10895 if (c1 != c2) \
10896 return (c1 < c2) ? -1 : 1; \
10897 } \
10898 } \
10899 while (0)
10900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 int kind1, kind2;
10902 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010903 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 kind1 = PyUnicode_KIND(str1);
10906 kind2 = PyUnicode_KIND(str2);
10907 data1 = PyUnicode_DATA(str1);
10908 data2 = PyUnicode_DATA(str2);
10909 len1 = PyUnicode_GET_LENGTH(str1);
10910 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010911 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010912
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 switch(kind1) {
10914 case PyUnicode_1BYTE_KIND:
10915 {
10916 switch(kind2) {
10917 case PyUnicode_1BYTE_KIND:
10918 {
10919 int cmp = memcmp(data1, data2, len);
10920 /* normalize result of memcmp() into the range [-1; 1] */
10921 if (cmp < 0)
10922 return -1;
10923 if (cmp > 0)
10924 return 1;
10925 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010926 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010927 case PyUnicode_2BYTE_KIND:
10928 COMPARE(Py_UCS1, Py_UCS2);
10929 break;
10930 case PyUnicode_4BYTE_KIND:
10931 COMPARE(Py_UCS1, Py_UCS4);
10932 break;
10933 default:
10934 assert(0);
10935 }
10936 break;
10937 }
10938 case PyUnicode_2BYTE_KIND:
10939 {
10940 switch(kind2) {
10941 case PyUnicode_1BYTE_KIND:
10942 COMPARE(Py_UCS2, Py_UCS1);
10943 break;
10944 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010945 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010946 COMPARE(Py_UCS2, Py_UCS2);
10947 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010948 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010949 case PyUnicode_4BYTE_KIND:
10950 COMPARE(Py_UCS2, Py_UCS4);
10951 break;
10952 default:
10953 assert(0);
10954 }
10955 break;
10956 }
10957 case PyUnicode_4BYTE_KIND:
10958 {
10959 switch(kind2) {
10960 case PyUnicode_1BYTE_KIND:
10961 COMPARE(Py_UCS4, Py_UCS1);
10962 break;
10963 case PyUnicode_2BYTE_KIND:
10964 COMPARE(Py_UCS4, Py_UCS2);
10965 break;
10966 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010967 {
10968#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10969 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10970 /* normalize result of wmemcmp() into the range [-1; 1] */
10971 if (cmp < 0)
10972 return -1;
10973 if (cmp > 0)
10974 return 1;
10975#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010976 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010977#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010978 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010979 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010980 default:
10981 assert(0);
10982 }
10983 break;
10984 }
10985 default:
10986 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010987 }
10988
Victor Stinner770e19e2012-10-04 22:59:45 +020010989 if (len1 == len2)
10990 return 0;
10991 if (len1 < len2)
10992 return -1;
10993 else
10994 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010995
10996#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010997}
10998
Benjamin Peterson621b4302016-09-09 13:54:34 -070010999static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011000unicode_compare_eq(PyObject *str1, PyObject *str2)
11001{
11002 int kind;
11003 void *data1, *data2;
11004 Py_ssize_t len;
11005 int cmp;
11006
Victor Stinnere5567ad2012-10-23 02:48:49 +020011007 len = PyUnicode_GET_LENGTH(str1);
11008 if (PyUnicode_GET_LENGTH(str2) != len)
11009 return 0;
11010 kind = PyUnicode_KIND(str1);
11011 if (PyUnicode_KIND(str2) != kind)
11012 return 0;
11013 data1 = PyUnicode_DATA(str1);
11014 data2 = PyUnicode_DATA(str2);
11015
11016 cmp = memcmp(data1, data2, len * kind);
11017 return (cmp == 0);
11018}
11019
11020
Alexander Belopolsky40018472011-02-26 01:02:56 +000011021int
11022PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11025 if (PyUnicode_READY(left) == -1 ||
11026 PyUnicode_READY(right) == -1)
11027 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011028
11029 /* a string is equal to itself */
11030 if (left == right)
11031 return 0;
11032
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011033 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011035 PyErr_Format(PyExc_TypeError,
11036 "Can't compare %.100s and %.100s",
11037 left->ob_type->tp_name,
11038 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 return -1;
11040}
11041
Martin v. Löwis5b222132007-06-10 09:51:05 +000011042int
11043PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 Py_ssize_t i;
11046 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011048 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049
Victor Stinner910337b2011-10-03 03:20:16 +020011050 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011051 if (!PyUnicode_IS_READY(uni)) {
11052 const wchar_t *ws = _PyUnicode_WSTR(uni);
11053 /* Compare Unicode string and source character set string */
11054 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11055 if (chr != ustr[i])
11056 return (chr < ustr[i]) ? -1 : 1;
11057 }
11058 /* This check keeps Python strings that end in '\0' from comparing equal
11059 to C strings identical up to that point. */
11060 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11061 return 1; /* uni is longer */
11062 if (ustr[i])
11063 return -1; /* str is longer */
11064 return 0;
11065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011067 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011068 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011069 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011070 size_t len, len2 = strlen(str);
11071 int cmp;
11072
11073 len = Py_MIN(len1, len2);
11074 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011075 if (cmp != 0) {
11076 if (cmp < 0)
11077 return -1;
11078 else
11079 return 1;
11080 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011081 if (len1 > len2)
11082 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011083 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011084 return -1; /* str is longer */
11085 return 0;
11086 }
11087 else {
11088 void *data = PyUnicode_DATA(uni);
11089 /* Compare Unicode string and source character set string */
11090 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011091 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011092 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11093 /* This check keeps Python strings that end in '\0' from comparing equal
11094 to C strings identical up to that point. */
11095 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11096 return 1; /* uni is longer */
11097 if (str[i])
11098 return -1; /* str is longer */
11099 return 0;
11100 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011101}
11102
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011103static int
11104non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11105{
11106 size_t i, len;
11107 const wchar_t *p;
11108 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11109 if (strlen(str) != len)
11110 return 0;
11111 p = _PyUnicode_WSTR(unicode);
11112 assert(p);
11113 for (i = 0; i < len; i++) {
11114 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011115 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011116 return 0;
11117 }
11118 return 1;
11119}
11120
11121int
11122_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11123{
11124 size_t len;
11125 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011126 assert(str);
11127#ifndef NDEBUG
11128 for (const char *p = str; *p; p++) {
11129 assert((unsigned char)*p < 128);
11130 }
11131#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011132 if (PyUnicode_READY(unicode) == -1) {
11133 /* Memory error or bad data */
11134 PyErr_Clear();
11135 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11136 }
11137 if (!PyUnicode_IS_ASCII(unicode))
11138 return 0;
11139 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11140 return strlen(str) == len &&
11141 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11142}
11143
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011144int
11145_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11146{
11147 PyObject *right_uni;
11148 Py_hash_t hash;
11149
11150 assert(_PyUnicode_CHECK(left));
11151 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011152#ifndef NDEBUG
11153 for (const char *p = right->string; *p; p++) {
11154 assert((unsigned char)*p < 128);
11155 }
11156#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011157
11158 if (PyUnicode_READY(left) == -1) {
11159 /* memory error or bad data */
11160 PyErr_Clear();
11161 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11162 }
11163
11164 if (!PyUnicode_IS_ASCII(left))
11165 return 0;
11166
11167 right_uni = _PyUnicode_FromId(right); /* borrowed */
11168 if (right_uni == NULL) {
11169 /* memory error or bad data */
11170 PyErr_Clear();
11171 return _PyUnicode_EqualToASCIIString(left, right->string);
11172 }
11173
11174 if (left == right_uni)
11175 return 1;
11176
11177 if (PyUnicode_CHECK_INTERNED(left))
11178 return 0;
11179
11180 assert(_PyUnicode_HASH(right_uni) != 1);
11181 hash = _PyUnicode_HASH(left);
11182 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11183 return 0;
11184
11185 return unicode_compare_eq(left, right_uni);
11186}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011187
/* Map a C truth value onto the matching Python bool singleton.  The macro
   itself does not adjust any reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011190
Alexander Belopolsky40018472011-02-26 01:02:56 +000011191PyObject *
11192PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011193{
11194 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011195 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011196
Victor Stinnere5567ad2012-10-23 02:48:49 +020011197 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11198 Py_RETURN_NOTIMPLEMENTED;
11199
11200 if (PyUnicode_READY(left) == -1 ||
11201 PyUnicode_READY(right) == -1)
11202 return NULL;
11203
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011204 if (left == right) {
11205 switch (op) {
11206 case Py_EQ:
11207 case Py_LE:
11208 case Py_GE:
11209 /* a string is equal to itself */
11210 v = Py_True;
11211 break;
11212 case Py_NE:
11213 case Py_LT:
11214 case Py_GT:
11215 v = Py_False;
11216 break;
11217 default:
11218 PyErr_BadArgument();
11219 return NULL;
11220 }
11221 }
11222 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011223 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011224 result ^= (op == Py_NE);
11225 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011226 }
11227 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011228 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011229
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011230 /* Convert the return value to a Boolean */
11231 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011232 case Py_LE:
11233 v = TEST_COND(result <= 0);
11234 break;
11235 case Py_GE:
11236 v = TEST_COND(result >= 0);
11237 break;
11238 case Py_LT:
11239 v = TEST_COND(result == -1);
11240 break;
11241 case Py_GT:
11242 v = TEST_COND(result == 1);
11243 break;
11244 default:
11245 PyErr_BadArgument();
11246 return NULL;
11247 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011248 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011249 Py_INCREF(v);
11250 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011251}
11252
Alexander Belopolsky40018472011-02-26 01:02:56 +000011253int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011254_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11255{
11256 return unicode_eq(aa, bb);
11257}
11258
11259int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011260PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011261{
Victor Stinner77282cb2013-04-14 19:22:47 +020011262 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 void *buf1, *buf2;
11264 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011265 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011266
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011267 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011269 "'in <string>' requires string as left operand, not %.100s",
11270 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011271 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011272 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011273 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011274 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011275 if (ensure_unicode(str) < 0)
11276 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011279 kind2 = PyUnicode_KIND(substr);
11280 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011281 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283 len2 = PyUnicode_GET_LENGTH(substr);
11284 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011285 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011286 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011287 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011288 if (len2 == 1) {
11289 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11290 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011291 return result;
11292 }
11293 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 buf2 = _PyUnicode_AsKind(substr, kind1);
11295 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011296 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298
Victor Stinner77282cb2013-04-14 19:22:47 +020011299 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 case PyUnicode_1BYTE_KIND:
11301 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11302 break;
11303 case PyUnicode_2BYTE_KIND:
11304 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11305 break;
11306 case PyUnicode_4BYTE_KIND:
11307 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11308 break;
11309 default:
11310 result = -1;
11311 assert(0);
11312 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011313
Victor Stinner77282cb2013-04-14 19:22:47 +020011314 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 PyMem_Free(buf2);
11316
Guido van Rossum403d68b2000-03-13 15:55:09 +000011317 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011318}
11319
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320/* Concat to string or Unicode object giving a new Unicode object. */
11321
Alexander Belopolsky40018472011-02-26 01:02:56 +000011322PyObject *
11323PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011325 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011326 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011327 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011329 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11330 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
11332 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011333 if (left == unicode_empty)
11334 return PyUnicode_FromObject(right);
11335 if (right == unicode_empty)
11336 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011338 left_len = PyUnicode_GET_LENGTH(left);
11339 right_len = PyUnicode_GET_LENGTH(right);
11340 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011341 PyErr_SetString(PyExc_OverflowError,
11342 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011343 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011344 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011345 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011346
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011347 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11348 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011349 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011352 result = PyUnicode_New(new_len, maxchar);
11353 if (result == NULL)
11354 return NULL;
11355 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11356 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11357 assert(_PyUnicode_CheckConsistency(result, 1));
11358 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359}
11360
Walter Dörwald1ab83302007-05-18 17:15:44 +000011361void
Victor Stinner23e56682011-10-03 03:54:37 +020011362PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011363{
Victor Stinner23e56682011-10-03 03:54:37 +020011364 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011365 Py_UCS4 maxchar, maxchar2;
11366 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011367
11368 if (p_left == NULL) {
11369 if (!PyErr_Occurred())
11370 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011371 return;
11372 }
Victor Stinner23e56682011-10-03 03:54:37 +020011373 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011374 if (right == NULL || left == NULL
11375 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011376 if (!PyErr_Occurred())
11377 PyErr_BadInternalCall();
11378 goto error;
11379 }
11380
Benjamin Petersonbac79492012-01-14 13:34:47 -050011381 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011382 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011383 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011384 goto error;
11385
Victor Stinner488fa492011-12-12 00:01:39 +010011386 /* Shortcuts */
11387 if (left == unicode_empty) {
11388 Py_DECREF(left);
11389 Py_INCREF(right);
11390 *p_left = right;
11391 return;
11392 }
11393 if (right == unicode_empty)
11394 return;
11395
11396 left_len = PyUnicode_GET_LENGTH(left);
11397 right_len = PyUnicode_GET_LENGTH(right);
11398 if (left_len > PY_SSIZE_T_MAX - right_len) {
11399 PyErr_SetString(PyExc_OverflowError,
11400 "strings are too large to concat");
11401 goto error;
11402 }
11403 new_len = left_len + right_len;
11404
11405 if (unicode_modifiable(left)
11406 && PyUnicode_CheckExact(right)
11407 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011408 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11409 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011410 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011411 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011412 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11413 {
11414 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011415 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011416 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011417
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011418 /* copy 'right' into the newly allocated area of 'left' */
11419 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011420 }
Victor Stinner488fa492011-12-12 00:01:39 +010011421 else {
11422 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11423 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011424 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011425
Victor Stinner488fa492011-12-12 00:01:39 +010011426 /* Concat the two Unicode strings */
11427 res = PyUnicode_New(new_len, maxchar);
11428 if (res == NULL)
11429 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011430 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11431 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011432 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011433 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011434 }
11435 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011436 return;
11437
11438error:
Victor Stinner488fa492011-12-12 00:01:39 +010011439 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011440}
11441
11442void
11443PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11444{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011445 PyUnicode_Append(pleft, right);
11446 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011447}
11448
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011449/*
11450Wraps stringlib_parse_args_finds() and additionally ensures that the
11451first argument is a unicode object.
11452*/
11453
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011454static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011455parse_args_finds_unicode(const char * function_name, PyObject *args,
11456 PyObject **substring,
11457 Py_ssize_t *start, Py_ssize_t *end)
11458{
11459 if(stringlib_parse_args_finds(function_name, args, substring,
11460 start, end)) {
11461 if (ensure_unicode(*substring) < 0)
11462 return 0;
11463 return 1;
11464 }
11465 return 0;
11466}
11467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011468PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011471Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011472string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
11475static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011476unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011478 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011479 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011480 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011482 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 void *buf1, *buf2;
11484 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011486 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 kind1 = PyUnicode_KIND(self);
11490 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011491 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011492 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 len1 = PyUnicode_GET_LENGTH(self);
11495 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011497 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011498 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011499
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011500 buf1 = PyUnicode_DATA(self);
11501 buf2 = PyUnicode_DATA(substring);
11502 if (kind2 != kind1) {
11503 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011504 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011505 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011506 }
11507 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 case PyUnicode_1BYTE_KIND:
11509 iresult = ucs1lib_count(
11510 ((Py_UCS1*)buf1) + start, end - start,
11511 buf2, len2, PY_SSIZE_T_MAX
11512 );
11513 break;
11514 case PyUnicode_2BYTE_KIND:
11515 iresult = ucs2lib_count(
11516 ((Py_UCS2*)buf1) + start, end - start,
11517 buf2, len2, PY_SSIZE_T_MAX
11518 );
11519 break;
11520 case PyUnicode_4BYTE_KIND:
11521 iresult = ucs4lib_count(
11522 ((Py_UCS4*)buf1) + start, end - start,
11523 buf2, len2, PY_SSIZE_T_MAX
11524 );
11525 break;
11526 default:
11527 assert(0); iresult = 0;
11528 }
11529
11530 result = PyLong_FromSsize_t(iresult);
11531
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011532 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 return result;
11536}
11537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011538PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011539 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011541Encode S using the codec registered for encoding. Default encoding\n\
11542is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011543handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011544a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11545'xmlcharrefreplace' as well as any other name registered with\n\
11546codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
11548static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011549unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011551 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 char *encoding = NULL;
11553 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011554
Benjamin Peterson308d6372009-09-18 21:42:35 +000011555 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11556 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011558 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011559}
11560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011561PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011562 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563\n\
11564Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011565If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566
11567static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011568unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011570 Py_ssize_t i, j, line_pos, src_len, incr;
11571 Py_UCS4 ch;
11572 PyObject *u;
11573 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011574 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011577 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578
Ezio Melotti745d54d2013-11-16 19:10:57 +020011579 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11580 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Antoine Pitrou22425222011-10-04 19:10:51 +020011583 if (PyUnicode_READY(self) == -1)
11584 return NULL;
11585
Thomas Wouters7e474022000-07-16 12:04:32 +000011586 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011587 src_len = PyUnicode_GET_LENGTH(self);
11588 i = j = line_pos = 0;
11589 kind = PyUnicode_KIND(self);
11590 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011591 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 for (; i < src_len; i++) {
11593 ch = PyUnicode_READ(kind, src_data, i);
11594 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011595 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 goto overflow;
11600 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011602 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011606 goto overflow;
11607 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011609 if (ch == '\n' || ch == '\r')
11610 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011612 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011613 if (!found)
11614 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011615
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011617 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618 if (!u)
11619 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Antoine Pitroue71d5742011-10-04 15:55:09 +020011622 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Antoine Pitroue71d5742011-10-04 15:55:09 +020011624 for (; i < src_len; i++) {
11625 ch = PyUnicode_READ(kind, src_data, i);
11626 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011628 incr = tabsize - (line_pos % tabsize);
11629 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011630 FILL(kind, dest_data, ' ', j, incr);
11631 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011633 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011635 line_pos++;
11636 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011637 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011638 if (ch == '\n' || ch == '\r')
11639 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011641 }
11642 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011643 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011644
Antoine Pitroue71d5742011-10-04 15:55:09 +020011645 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011646 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648}
11649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652\n\
11653Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011654such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655arguments start and end are interpreted as in slice notation.\n\
11656\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011657Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011662 /* initialize variables to prevent gcc warning */
11663 PyObject *substring = NULL;
11664 Py_ssize_t start = 0;
11665 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011666 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011668 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011671 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011674 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if (result == -2)
11677 return NULL;
11678
Christian Heimes217cfd12007-12-02 14:31:20 +000011679 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680}
11681
11682static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011683unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011685 void *data;
11686 enum PyUnicode_Kind kind;
11687 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011688
11689 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11690 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011692 }
11693 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11694 PyErr_SetString(PyExc_IndexError, "string index out of range");
11695 return NULL;
11696 }
11697 kind = PyUnicode_KIND(self);
11698 data = PyUnicode_DATA(self);
11699 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011700 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701}
11702
Guido van Rossumc2504932007-09-18 19:42:40 +000011703/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011704 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011705static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011706unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707{
Guido van Rossumc2504932007-09-18 19:42:40 +000011708 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011709 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011710
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011711#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011712 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011713#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (_PyUnicode_HASH(self) != -1)
11715 return _PyUnicode_HASH(self);
11716 if (PyUnicode_READY(self) == -1)
11717 return -1;
11718 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011719 /*
11720 We make the hash of the empty string be 0, rather than using
11721 (prefix ^ suffix), since this slightly obfuscates the hash secret
11722 */
11723 if (len == 0) {
11724 _PyUnicode_HASH(self) = 0;
11725 return 0;
11726 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011727 x = _Py_HashBytes(PyUnicode_DATA(self),
11728 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011730 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731}
11732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011733PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735\n\
Mariatta577fc042017-04-09 15:17:06 -070011736Return the lowest index in S where substring sub is found, \n\
11737such that sub is contained within S[start:end]. Optional\n\
11738arguments start and end are interpreted as in slice notation.\n\
11739\n\
11740Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
11742static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011745 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011746 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011747 PyObject *substring = NULL;
11748 Py_ssize_t start = 0;
11749 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011751 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011754 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011757 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 if (result == -2)
11760 return NULL;
11761
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 if (result < 0) {
11763 PyErr_SetString(PyExc_ValueError, "substring not found");
11764 return NULL;
11765 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011766
Christian Heimes217cfd12007-12-02 14:31:20 +000011767 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768}
11769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011773Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011774at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
11776static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011777unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 Py_ssize_t i, length;
11780 int kind;
11781 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 int cased;
11783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 if (PyUnicode_READY(self) == -1)
11785 return NULL;
11786 length = PyUnicode_GET_LENGTH(self);
11787 kind = PyUnicode_KIND(self);
11788 data = PyUnicode_DATA(self);
11789
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (length == 1)
11792 return PyBool_FromLong(
11793 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011795 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 for (i = 0; i < length; i++) {
11801 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011802
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11804 return PyBool_FromLong(0);
11805 else if (!cased && Py_UNICODE_ISLOWER(ch))
11806 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011808 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809}
11810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011811PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011814Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
11817static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011818unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 Py_ssize_t i, length;
11821 int kind;
11822 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 int cased;
11824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 if (PyUnicode_READY(self) == -1)
11826 return NULL;
11827 length = PyUnicode_GET_LENGTH(self);
11828 kind = PyUnicode_KIND(self);
11829 data = PyUnicode_DATA(self);
11830
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 if (length == 1)
11833 return PyBool_FromLong(
11834 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011836 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011839
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 for (i = 0; i < length; i++) {
11842 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011843
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11845 return PyBool_FromLong(0);
11846 else if (!cased && Py_UNICODE_ISUPPER(ch))
11847 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011849 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850}
11851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011852PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011855Return True if S is a titlecased string and there is at least one\n\
11856character in S, i.e. upper- and titlecase characters may only\n\
11857follow uncased characters and lowercase characters only cased ones.\n\
11858Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859
11860static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011861unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 Py_ssize_t i, length;
11864 int kind;
11865 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 int cased, previous_is_cased;
11867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 if (PyUnicode_READY(self) == -1)
11869 return NULL;
11870 length = PyUnicode_GET_LENGTH(self);
11871 kind = PyUnicode_KIND(self);
11872 data = PyUnicode_DATA(self);
11873
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (length == 1) {
11876 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11877 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11878 (Py_UNICODE_ISUPPER(ch) != 0));
11879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011881 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011884
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 cased = 0;
11886 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 for (i = 0; i < length; i++) {
11888 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011889
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11891 if (previous_is_cased)
11892 return PyBool_FromLong(0);
11893 previous_is_cased = 1;
11894 cased = 1;
11895 }
11896 else if (Py_UNICODE_ISLOWER(ch)) {
11897 if (!previous_is_cased)
11898 return PyBool_FromLong(0);
11899 previous_is_cased = 1;
11900 cased = 1;
11901 }
11902 else
11903 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011905 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011908PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011911Return True if all characters in S are whitespace\n\
11912and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
11914static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011915unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 Py_ssize_t i, length;
11918 int kind;
11919 void *data;
11920
11921 if (PyUnicode_READY(self) == -1)
11922 return NULL;
11923 length = PyUnicode_GET_LENGTH(self);
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 1)
11929 return PyBool_FromLong(
11930 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 for (i = 0; i < length; i++) {
11937 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011938 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011941 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942}
11943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011944PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011947Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949
11950static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011951unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 Py_ssize_t i, length;
11954 int kind;
11955 void *data;
11956
11957 if (PyUnicode_READY(self) == -1)
11958 return NULL;
11959 length = PyUnicode_GET_LENGTH(self);
11960 kind = PyUnicode_KIND(self);
11961 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011963 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 if (length == 1)
11965 return PyBool_FromLong(
11966 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967
11968 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 for (i = 0; i < length; i++) {
11973 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011975 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011976 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011977}
11978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011979PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011982Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011983and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011984
11985static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011986unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 int kind;
11989 void *data;
11990 Py_ssize_t len, i;
11991
11992 if (PyUnicode_READY(self) == -1)
11993 return NULL;
11994
11995 kind = PyUnicode_KIND(self);
11996 data = PyUnicode_DATA(self);
11997 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011998
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011999 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 if (len == 1) {
12001 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12002 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12003 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004
12005 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 for (i = 0; i < len; i++) {
12010 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012011 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012013 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012014 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012015}
12016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012017PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000012020Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012021False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
12023static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012024unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 Py_ssize_t i, length;
12027 int kind;
12028 void *data;
12029
12030 if (PyUnicode_READY(self) == -1)
12031 return NULL;
12032 length = PyUnicode_GET_LENGTH(self);
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (length == 1)
12038 return PyBool_FromLong(
12039 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012041 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 for (i = 0; i < length; i++) {
12046 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012049 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050}
12051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012052PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000012055Return True if all characters in S are digits\n\
12056and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057
12058static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012059unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 Py_ssize_t i, length;
12062 int kind;
12063 void *data;
12064
12065 if (PyUnicode_READY(self) == -1)
12066 return NULL;
12067 length = PyUnicode_GET_LENGTH(self);
12068 kind = PyUnicode_KIND(self);
12069 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 if (length == 1) {
12073 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12074 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012077 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 for (i = 0; i < length; i++) {
12082 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012085 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086}
12087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012088PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000012091Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012092False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
12094static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012095unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 Py_ssize_t i, length;
12098 int kind;
12099 void *data;
12100
12101 if (PyUnicode_READY(self) == -1)
12102 return NULL;
12103 length = PyUnicode_GET_LENGTH(self);
12104 kind = PyUnicode_KIND(self);
12105 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (length == 1)
12109 return PyBool_FromLong(
12110 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012112 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 for (i = 0; i < length; i++) {
12117 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012118 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012120 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121}
12122
Martin v. Löwis47383402007-08-15 07:32:56 +000012123int
12124PyUnicode_IsIdentifier(PyObject *self)
12125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 int kind;
12127 void *data;
12128 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012129 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 if (PyUnicode_READY(self) == -1) {
12132 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 }
12135
12136 /* Special case for empty strings */
12137 if (PyUnicode_GET_LENGTH(self) == 0)
12138 return 0;
12139 kind = PyUnicode_KIND(self);
12140 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012141
12142 /* PEP 3131 says that the first character must be in
12143 XID_Start and subsequent characters in XID_Continue,
12144 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012145 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012146 letters, digits, underscore). However, given the current
12147 definition of XID_Start and XID_Continue, it is sufficient
12148 to check just for these, except that _ must be allowed
12149 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012151 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012152 return 0;
12153
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012154 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012157 return 1;
12158}
12159
12160PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012161 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012162\n\
12163Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012164to the language definition.\n\
12165\n\
12166Use keyword.iskeyword() to test for reserved identifiers\n\
12167such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012168
12169static PyObject*
12170unicode_isidentifier(PyObject *self)
12171{
12172 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12173}
12174
Georg Brandl559e5d72008-06-11 18:37:52 +000012175PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012177\n\
12178Return True if all characters in S are considered\n\
12179printable in repr() or S is empty, False otherwise.");
12180
12181static PyObject*
12182unicode_isprintable(PyObject *self)
12183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 Py_ssize_t i, length;
12185 int kind;
12186 void *data;
12187
12188 if (PyUnicode_READY(self) == -1)
12189 return NULL;
12190 length = PyUnicode_GET_LENGTH(self);
12191 kind = PyUnicode_KIND(self);
12192 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012193
12194 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 if (length == 1)
12196 return PyBool_FromLong(
12197 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 for (i = 0; i < length; i++) {
12200 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012201 Py_RETURN_FALSE;
12202 }
12203 }
12204 Py_RETURN_TRUE;
12205}
12206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012207PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012208 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209\n\
12210Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012211iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
12213static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012214unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012216 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217}
12218
Martin v. Löwis18e16552006-02-15 17:27:45 +000012219static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012220unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 if (PyUnicode_READY(self) == -1)
12223 return -1;
12224 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225}
12226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012227PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012230Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012231done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232
12233static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012234unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012236 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 Py_UCS4 fillchar = ' ';
12238
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012239 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 return NULL;
12241
Benjamin Petersonbac79492012-01-14 13:34:47 -050012242 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
Victor Stinnerc4b49542011-12-11 22:44:26 +010012245 if (PyUnicode_GET_LENGTH(self) >= width)
12246 return unicode_result_unchanged(self);
12247
12248 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249}
12250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012251PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012254Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
12256static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012257unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012259 if (PyUnicode_READY(self) == -1)
12260 return NULL;
12261 if (PyUnicode_IS_ASCII(self))
12262 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012263 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264}
12265
/* Strip modes: which end(s) of the string are trimmed */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing formats, indexed by the strip modes above */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: skips the 3-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
12274
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275/* externally visible for str.strip(unicode) */
12276PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012277_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 void *data;
12280 int kind;
12281 Py_ssize_t i, j, len;
12282 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012283 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12286 return NULL;
12287
12288 kind = PyUnicode_KIND(self);
12289 data = PyUnicode_DATA(self);
12290 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012291 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12293 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012294 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295
Benjamin Peterson14339b62009-01-31 16:36:08 +000012296 i = 0;
12297 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012298 while (i < len) {
12299 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12300 if (!BLOOM(sepmask, ch))
12301 break;
12302 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12303 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 i++;
12305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
Benjamin Peterson14339b62009-01-31 16:36:08 +000012308 j = len;
12309 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012310 j--;
12311 while (j >= i) {
12312 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12313 if (!BLOOM(sepmask, ch))
12314 break;
12315 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12316 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012318 }
12319
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012322
Victor Stinner7931d9a2011-11-04 00:22:48 +010012323 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324}
12325
12326PyObject*
12327PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12328{
12329 unsigned char *data;
12330 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012331 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332
Victor Stinnerde636f32011-10-01 03:55:54 +020012333 if (PyUnicode_READY(self) == -1)
12334 return NULL;
12335
Victor Stinner684d5fd2012-05-03 02:32:34 +020012336 length = PyUnicode_GET_LENGTH(self);
12337 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012338
Victor Stinner684d5fd2012-05-03 02:32:34 +020012339 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012340 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341
Victor Stinnerde636f32011-10-01 03:55:54 +020012342 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012343 PyErr_SetString(PyExc_IndexError, "string index out of range");
12344 return NULL;
12345 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012346 if (start >= length || end < start)
12347 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012348
Victor Stinner684d5fd2012-05-03 02:32:34 +020012349 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012350 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012351 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012352 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012353 }
12354 else {
12355 kind = PyUnicode_KIND(self);
12356 data = PyUnicode_1BYTE_DATA(self);
12357 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012358 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012359 length);
12360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
12363static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012364do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 Py_ssize_t len, i, j;
12367
12368 if (PyUnicode_READY(self) == -1)
12369 return NULL;
12370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012372
Victor Stinnercc7af722013-04-09 22:39:24 +020012373 if (PyUnicode_IS_ASCII(self)) {
12374 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12375
12376 i = 0;
12377 if (striptype != RIGHTSTRIP) {
12378 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012379 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012380 if (!_Py_ascii_whitespace[ch])
12381 break;
12382 i++;
12383 }
12384 }
12385
12386 j = len;
12387 if (striptype != LEFTSTRIP) {
12388 j--;
12389 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012390 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012391 if (!_Py_ascii_whitespace[ch])
12392 break;
12393 j--;
12394 }
12395 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012396 }
12397 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012398 else {
12399 int kind = PyUnicode_KIND(self);
12400 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012401
Victor Stinnercc7af722013-04-09 22:39:24 +020012402 i = 0;
12403 if (striptype != RIGHTSTRIP) {
12404 while (i < len) {
12405 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12406 if (!Py_UNICODE_ISSPACE(ch))
12407 break;
12408 i++;
12409 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012410 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012411
12412 j = len;
12413 if (striptype != LEFTSTRIP) {
12414 j--;
12415 while (j >= i) {
12416 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12417 if (!Py_UNICODE_ISSPACE(ch))
12418 break;
12419 j--;
12420 }
12421 j++;
12422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012424
Victor Stinner7931d9a2011-11-04 00:22:48 +010012425 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426}
12427
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012428
12429static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012430do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433
Serhiy Storchakac6792272013-10-19 21:03:34 +030012434 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012436
Benjamin Peterson14339b62009-01-31 16:36:08 +000012437 if (sep != NULL && sep != Py_None) {
12438 if (PyUnicode_Check(sep))
12439 return _PyUnicode_XStrip(self, striptype, sep);
12440 else {
12441 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 "%s arg must be None or str",
12443 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 return NULL;
12445 }
12446 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012447
Benjamin Peterson14339b62009-01-31 16:36:08 +000012448 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012449}
12450
12451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012452PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454\n\
12455Return a copy of the string S with leading and trailing\n\
12456whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012457If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458
12459static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012460unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 if (PyTuple_GET_SIZE(args) == 0)
12463 return do_strip(self, BOTHSTRIP); /* Common case */
12464 else
12465 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012466}
12467
12468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012469PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012471\n\
12472Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012473If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012474
12475static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012476unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 if (PyTuple_GET_SIZE(args) == 0)
12479 return do_strip(self, LEFTSTRIP); /* Common case */
12480 else
12481 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012482}
12483
12484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012485PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012487\n\
12488Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012489If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012490
12491static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012492unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012494 if (PyTuple_GET_SIZE(args) == 0)
12495 return do_strip(self, RIGHTSTRIP); /* Common case */
12496 else
12497 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498}
12499
12500
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012502unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012504 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
Serhiy Storchaka05997252013-01-26 12:14:02 +020012507 if (len < 1)
12508 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
Victor Stinnerc4b49542011-12-11 22:44:26 +010012510 /* no repeat, return original string */
12511 if (len == 1)
12512 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012513
Benjamin Petersonbac79492012-01-14 13:34:47 -050012514 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 return NULL;
12516
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012517 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012518 PyErr_SetString(PyExc_OverflowError,
12519 "repeated string is too long");
12520 return NULL;
12521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012523
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 if (!u)
12526 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012527 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 if (PyUnicode_GET_LENGTH(str) == 1) {
12530 const int kind = PyUnicode_KIND(str);
12531 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012532 if (kind == PyUnicode_1BYTE_KIND) {
12533 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012534 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012535 }
12536 else if (kind == PyUnicode_2BYTE_KIND) {
12537 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012538 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012539 ucs2[n] = fill_char;
12540 } else {
12541 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12542 assert(kind == PyUnicode_4BYTE_KIND);
12543 for (n = 0; n < len; ++n)
12544 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 }
12547 else {
12548 /* number of characters copied this far */
12549 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012550 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012552 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012556 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012557 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559 }
12560
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012561 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012562 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
12564
Alexander Belopolsky40018472011-02-26 01:02:56 +000012565PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012566PyUnicode_Replace(PyObject *str,
12567 PyObject *substr,
12568 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012569 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012571 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12572 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012574 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575}
12576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012577PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012578 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579\n\
12580Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012581old replaced by new. If the optional argument count is\n\
12582given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
12584static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 PyObject *str1;
12588 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012589 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012591 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012593 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012595 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596}
12597
Alexander Belopolsky40018472011-02-26 01:02:56 +000012598static PyObject *
12599unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012601 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 Py_ssize_t isize;
12603 Py_ssize_t osize, squote, dquote, i, o;
12604 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012605 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012609 return NULL;
12610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 isize = PyUnicode_GET_LENGTH(unicode);
12612 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 /* Compute length of output, quote characters, and
12615 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012616 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 max = 127;
12618 squote = dquote = 0;
12619 ikind = PyUnicode_KIND(unicode);
12620 for (i = 0; i < isize; i++) {
12621 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012622 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012624 case '\'': squote++; break;
12625 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012627 incr = 2;
12628 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 default:
12630 /* Fast-path ASCII */
12631 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012632 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012634 ;
12635 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012638 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012640 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012642 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012644 if (osize > PY_SSIZE_T_MAX - incr) {
12645 PyErr_SetString(PyExc_OverflowError,
12646 "string is too long to generate repr");
12647 return NULL;
12648 }
12649 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 }
12651
12652 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012653 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012655 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 if (dquote)
12657 /* Both squote and dquote present. Use squote,
12658 and escape them */
12659 osize += squote;
12660 else
12661 quote = '"';
12662 }
Victor Stinner55c08782013-04-14 18:45:39 +020012663 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664
12665 repr = PyUnicode_New(osize, max);
12666 if (repr == NULL)
12667 return NULL;
12668 okind = PyUnicode_KIND(repr);
12669 odata = PyUnicode_DATA(repr);
12670
12671 PyUnicode_WRITE(okind, odata, 0, quote);
12672 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012673 if (unchanged) {
12674 _PyUnicode_FastCopyCharacters(repr, 1,
12675 unicode, 0,
12676 isize);
12677 }
12678 else {
12679 for (i = 0, o = 1; i < isize; i++) {
12680 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681
Victor Stinner55c08782013-04-14 18:45:39 +020012682 /* Escape quotes and backslashes */
12683 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012684 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012686 continue;
12687 }
12688
12689 /* Map special whitespace to '\t', \n', '\r' */
12690 if (ch == '\t') {
12691 PyUnicode_WRITE(okind, odata, o++, '\\');
12692 PyUnicode_WRITE(okind, odata, o++, 't');
12693 }
12694 else if (ch == '\n') {
12695 PyUnicode_WRITE(okind, odata, o++, '\\');
12696 PyUnicode_WRITE(okind, odata, o++, 'n');
12697 }
12698 else if (ch == '\r') {
12699 PyUnicode_WRITE(okind, odata, o++, '\\');
12700 PyUnicode_WRITE(okind, odata, o++, 'r');
12701 }
12702
12703 /* Map non-printable US ASCII to '\xhh' */
12704 else if (ch < ' ' || ch == 0x7F) {
12705 PyUnicode_WRITE(okind, odata, o++, '\\');
12706 PyUnicode_WRITE(okind, odata, o++, 'x');
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12709 }
12710
12711 /* Copy ASCII characters as-is */
12712 else if (ch < 0x7F) {
12713 PyUnicode_WRITE(okind, odata, o++, ch);
12714 }
12715
12716 /* Non-ASCII characters */
12717 else {
12718 /* Map Unicode whitespace and control characters
12719 (categories Z* and C* except ASCII space)
12720 */
12721 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12722 PyUnicode_WRITE(okind, odata, o++, '\\');
12723 /* Map 8-bit characters to '\xhh' */
12724 if (ch <= 0xff) {
12725 PyUnicode_WRITE(okind, odata, o++, 'x');
12726 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12727 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12728 }
12729 /* Map 16-bit characters to '\uxxxx' */
12730 else if (ch <= 0xffff) {
12731 PyUnicode_WRITE(okind, odata, o++, 'u');
12732 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12733 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12735 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12736 }
12737 /* Map 21-bit characters to '\U00xxxxxx' */
12738 else {
12739 PyUnicode_WRITE(okind, odata, o++, 'U');
12740 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12748 }
12749 }
12750 /* Copy characters as-is */
12751 else {
12752 PyUnicode_WRITE(okind, odata, o++, ch);
12753 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012754 }
12755 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012758 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012759 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760}
12761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012762PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764\n\
12765Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012766such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767arguments start and end are interpreted as in slice notation.\n\
12768\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012769Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770
12771static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012774 /* initialize variables to prevent gcc warning */
12775 PyObject *substring = NULL;
12776 Py_ssize_t start = 0;
12777 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012778 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012780 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012783 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012786 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 if (result == -2)
12789 return NULL;
12790
Christian Heimes217cfd12007-12-02 14:31:20 +000012791 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792}
12793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012794PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796\n\
Mariatta577fc042017-04-09 15:17:06 -070012797Return the highest index in S where substring sub is found,\n\
12798such that sub is contained within S[start:end]. Optional\n\
12799arguments start and end are interpreted as in slice notation.\n\
12800\n\
12801Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
12803static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012806 /* initialize variables to prevent gcc warning */
12807 PyObject *substring = NULL;
12808 Py_ssize_t start = 0;
12809 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012812 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012815 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012818 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 if (result == -2)
12821 return NULL;
12822
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823 if (result < 0) {
12824 PyErr_SetString(PyExc_ValueError, "substring not found");
12825 return NULL;
12826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827
Christian Heimes217cfd12007-12-02 14:31:20 +000012828 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829}
12830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012831PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012834Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012835done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836
12837static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012838unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012840 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 Py_UCS4 fillchar = ' ';
12842
Victor Stinnere9a29352011-10-01 02:14:59 +020012843 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012845
Benjamin Petersonbac79492012-01-14 13:34:47 -050012846 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847 return NULL;
12848
Victor Stinnerc4b49542011-12-11 22:44:26 +010012849 if (PyUnicode_GET_LENGTH(self) >= width)
12850 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851
Victor Stinnerc4b49542011-12-11 22:44:26 +010012852 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853}
12854
Alexander Belopolsky40018472011-02-26 01:02:56 +000012855PyObject *
12856PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012864PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012865 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866\n\
12867Return a list of the words in S, using sep as the\n\
12868delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012869splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012870whitespace string is a separator and empty strings are\n\
12871removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
12873static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012874unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012876 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012878 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012880 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12881 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882 return NULL;
12883
12884 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012886
12887 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012888 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012889
12890 PyErr_Format(PyExc_TypeError,
12891 "must be str or None, not %.100s",
12892 Py_TYPE(substring)->tp_name);
12893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
Thomas Wouters477c8d52006-05-27 19:21:47 +000012896PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012897PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012898{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012899 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012900 int kind1, kind2;
12901 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012904 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
Victor Stinner14f8f022011-10-05 20:58:25 +020012907 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 len1 = PyUnicode_GET_LENGTH(str_obj);
12910 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012911 if (kind1 < kind2 || len1 < len2) {
12912 _Py_INCREF_UNICODE_EMPTY();
12913 if (!unicode_empty)
12914 out = NULL;
12915 else {
12916 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12917 Py_DECREF(unicode_empty);
12918 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012919 return out;
12920 }
12921 buf1 = PyUnicode_DATA(str_obj);
12922 buf2 = PyUnicode_DATA(sep_obj);
12923 if (kind2 != kind1) {
12924 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12925 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012931 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12932 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12933 else
12934 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 break;
12936 case PyUnicode_2BYTE_KIND:
12937 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12938 break;
12939 case PyUnicode_4BYTE_KIND:
12940 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941 break;
12942 default:
12943 assert(0);
12944 out = 0;
12945 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012946
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012947 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012949
12950 return out;
12951}
12952
12953
12954PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012955PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012956{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012958 int kind1, kind2;
12959 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012961
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012962 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012964
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012965 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 len1 = PyUnicode_GET_LENGTH(str_obj);
12968 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012969 if (kind1 < kind2 || len1 < len2) {
12970 _Py_INCREF_UNICODE_EMPTY();
12971 if (!unicode_empty)
12972 out = NULL;
12973 else {
12974 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12975 Py_DECREF(unicode_empty);
12976 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012977 return out;
12978 }
12979 buf1 = PyUnicode_DATA(str_obj);
12980 buf2 = PyUnicode_DATA(sep_obj);
12981 if (kind2 != kind1) {
12982 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12983 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012984 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012987 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012989 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12990 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12991 else
12992 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 break;
12994 case PyUnicode_2BYTE_KIND:
12995 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12996 break;
12997 case PyUnicode_4BYTE_KIND:
12998 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12999 break;
13000 default:
13001 assert(0);
13002 out = 0;
13003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013004
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013005 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013007
13008 return out;
13009}
13010
13011PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000013014Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013015the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000013016found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013017
13018static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013019unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013020{
Victor Stinner9310abb2011-10-05 00:59:23 +020013021 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013022}
13023
13024PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000013025 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013026\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000013027Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013028the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000013029separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013030
13031static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013032unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033{
Victor Stinner9310abb2011-10-05 00:59:23 +020013034 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035}
13036
Alexander Belopolsky40018472011-02-26 01:02:56 +000013037PyObject *
13038PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013039{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013040 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013041 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013042
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013043 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013044}
13045
13046PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013047 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013048\n\
13049Return a list of the words in S, using sep as the\n\
13050delimiter string, starting at the end of the string and\n\
13051working to the front. If maxsplit is given, at most maxsplit\n\
13052splits are done. If sep is not specified, any whitespace string\n\
13053is a separator.");
13054
13055static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013056unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013057{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013058 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013059 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013060 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013061
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013062 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13063 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013064 return NULL;
13065
13066 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013068
13069 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020013070 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013071
13072 PyErr_Format(PyExc_TypeError,
13073 "must be str or None, not %.100s",
13074 Py_TYPE(substring)->tp_name);
13075 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013076}
13077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013078PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080\n\
13081Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013082Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013083is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084
13085static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013086unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013088 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013089 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013091 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13092 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093 return NULL;
13094
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013095 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096}
13097
13098static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013099PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013101 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102}
13103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013104PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106\n\
13107Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013108and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109
13110static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013111unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013113 if (PyUnicode_READY(self) == -1)
13114 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013115 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116}
13117
Larry Hastings61272b72014-01-07 12:41:53 -080013118/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013119
Larry Hastings31826802013-10-19 00:09:25 -070013120@staticmethod
13121str.maketrans as unicode_maketrans
13122
13123 x: object
13124
13125 y: unicode=NULL
13126
13127 z: unicode=NULL
13128
13129 /
13130
13131Return a translation table usable for str.translate().
13132
13133If there is only one argument, it must be a dictionary mapping Unicode
13134ordinals (integers) or characters to Unicode ordinals, strings or None.
13135Character keys will be then converted to ordinals.
13136If there are two arguments, they must be strings of equal length, and
13137in the resulting dictionary, each character in x will be mapped to the
13138character at the same position in y. If there is a third argument, it
13139must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013140[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013141
Larry Hastings31826802013-10-19 00:09:25 -070013142static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013143unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013144/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013145{
Georg Brandlceee0772007-11-27 23:48:05 +000013146 PyObject *new = NULL, *key, *value;
13147 Py_ssize_t i = 0;
13148 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149
Georg Brandlceee0772007-11-27 23:48:05 +000013150 new = PyDict_New();
13151 if (!new)
13152 return NULL;
13153 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 int x_kind, y_kind, z_kind;
13155 void *x_data, *y_data, *z_data;
13156
Georg Brandlceee0772007-11-27 23:48:05 +000013157 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013158 if (!PyUnicode_Check(x)) {
13159 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13160 "be a string if there is a second argument");
13161 goto err;
13162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013164 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13165 "arguments must have equal length");
13166 goto err;
13167 }
13168 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 x_kind = PyUnicode_KIND(x);
13170 y_kind = PyUnicode_KIND(y);
13171 x_data = PyUnicode_DATA(x);
13172 y_data = PyUnicode_DATA(y);
13173 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13174 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013175 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013176 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013177 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013178 if (!value) {
13179 Py_DECREF(key);
13180 goto err;
13181 }
Georg Brandlceee0772007-11-27 23:48:05 +000013182 res = PyDict_SetItem(new, key, value);
13183 Py_DECREF(key);
13184 Py_DECREF(value);
13185 if (res < 0)
13186 goto err;
13187 }
13188 /* create entries for deleting chars in z */
13189 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 z_kind = PyUnicode_KIND(z);
13191 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013192 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013194 if (!key)
13195 goto err;
13196 res = PyDict_SetItem(new, key, Py_None);
13197 Py_DECREF(key);
13198 if (res < 0)
13199 goto err;
13200 }
13201 }
13202 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 int kind;
13204 void *data;
13205
Georg Brandlceee0772007-11-27 23:48:05 +000013206 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013207 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013208 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13209 "to maketrans it must be a dict");
13210 goto err;
13211 }
13212 /* copy entries into the new dict, converting string keys to int keys */
13213 while (PyDict_Next(x, &i, &key, &value)) {
13214 if (PyUnicode_Check(key)) {
13215 /* convert string keys to integer keys */
13216 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013217 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013218 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13219 "table must be of length 1");
13220 goto err;
13221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 kind = PyUnicode_KIND(key);
13223 data = PyUnicode_DATA(key);
13224 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013225 if (!newkey)
13226 goto err;
13227 res = PyDict_SetItem(new, newkey, value);
13228 Py_DECREF(newkey);
13229 if (res < 0)
13230 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013231 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013232 /* just keep integer keys */
13233 if (PyDict_SetItem(new, key, value) < 0)
13234 goto err;
13235 } else {
13236 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13237 "be strings or integers");
13238 goto err;
13239 }
13240 }
13241 }
13242 return new;
13243 err:
13244 Py_DECREF(new);
13245 return NULL;
13246}
13247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013248PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013251Return a copy of the string S in which each character has been mapped\n\
13252through the given translation table. The table must implement\n\
13253lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13254mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13255this operation raises LookupError, the character is left untouched.\n\
13256Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257
13258static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262}
13263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013264PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013267Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268
13269static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013270unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013272 if (PyUnicode_READY(self) == -1)
13273 return NULL;
13274 if (PyUnicode_IS_ASCII(self))
13275 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013276 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277}
13278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013279PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013282Pad a numeric string S with zeros on the left, to fill a field\n\
13283of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
13285static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013286unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013288 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013289 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013290 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 int kind;
13292 void *data;
13293 Py_UCS4 chr;
13294
Martin v. Löwis18e16552006-02-15 17:27:45 +000013295 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296 return NULL;
13297
Benjamin Petersonbac79492012-01-14 13:34:47 -050013298 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
Victor Stinnerc4b49542011-12-11 22:44:26 +010013301 if (PyUnicode_GET_LENGTH(self) >= width)
13302 return unicode_result_unchanged(self);
13303
13304 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305
13306 u = pad(self, fill, 0, '0');
13307
Walter Dörwald068325e2002-04-15 13:36:47 +000013308 if (u == NULL)
13309 return NULL;
13310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 kind = PyUnicode_KIND(u);
13312 data = PyUnicode_DATA(u);
13313 chr = PyUnicode_READ(kind, data, fill);
13314
13315 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 PyUnicode_WRITE(kind, data, 0, chr);
13318 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319 }
13320
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013321 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013322 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324
#if 0
/* Compiled out by the surrounding "#if 0".
   Forwards self to PyUnicode_TransformDecimalAndSpaceToASCII() and returns
   its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013333PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013336Return True if S starts with the specified prefix, False otherwise.\n\
13337With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013338With optional end, stop comparing S at that position.\n\
13339prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340
13341static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013342unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013345 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013346 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013347 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013348 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013349 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350
Jesus Ceaac451502011-04-20 17:09:23 +020013351 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013353 if (PyTuple_Check(subobj)) {
13354 Py_ssize_t i;
13355 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013356 substring = PyTuple_GET_ITEM(subobj, i);
13357 if (!PyUnicode_Check(substring)) {
13358 PyErr_Format(PyExc_TypeError,
13359 "tuple for startswith must only contain str, "
13360 "not %.100s",
13361 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013362 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013363 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013364 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013365 if (result == -1)
13366 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367 if (result) {
13368 Py_RETURN_TRUE;
13369 }
13370 }
13371 /* nothing matched */
13372 Py_RETURN_FALSE;
13373 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013374 if (!PyUnicode_Check(subobj)) {
13375 PyErr_Format(PyExc_TypeError,
13376 "startswith first arg must be str or "
13377 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013379 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013380 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013381 if (result == -1)
13382 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384}
13385
13386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013387PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013390Return True if S ends with the specified suffix, False otherwise.\n\
13391With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013392With optional end, stop comparing S at that position.\n\
13393suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394
13395static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013396unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013399 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013400 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013401 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013402 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404
Jesus Ceaac451502011-04-20 17:09:23 +020013405 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013407 if (PyTuple_Check(subobj)) {
13408 Py_ssize_t i;
13409 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013410 substring = PyTuple_GET_ITEM(subobj, i);
13411 if (!PyUnicode_Check(substring)) {
13412 PyErr_Format(PyExc_TypeError,
13413 "tuple for endswith must only contain str, "
13414 "not %.100s",
13415 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013417 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013418 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013419 if (result == -1)
13420 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421 if (result) {
13422 Py_RETURN_TRUE;
13423 }
13424 }
13425 Py_RETURN_FALSE;
13426 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013427 if (!PyUnicode_Check(subobj)) {
13428 PyErr_Format(PyExc_TypeError,
13429 "endswith first arg must be str or "
13430 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013432 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013433 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013434 if (result == -1)
13435 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437}
13438
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013439static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013440_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013441{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013442 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13443 writer->data = PyUnicode_DATA(writer->buffer);
13444
13445 if (!writer->readonly) {
13446 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013447 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013448 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013449 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013450 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13451 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13452 writer->kind = PyUnicode_WCHAR_KIND;
13453 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13454
Victor Stinner8f674cc2013-04-17 23:02:17 +020013455 /* Copy-on-write mode: set buffer size to 0 so
13456 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13457 * next write. */
13458 writer->size = 0;
13459 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013460}
13461
Victor Stinnerd3f08822012-05-29 12:57:52 +020013462void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013463_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013464{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013465 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013466
13467 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013468 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013469
13470 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13471 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13472 writer->kind = PyUnicode_WCHAR_KIND;
13473 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013474}
13475
Victor Stinnerd3f08822012-05-29 12:57:52 +020013476int
13477_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13478 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013479{
13480 Py_ssize_t newlen;
13481 PyObject *newbuffer;
13482
Victor Stinner2740e462016-09-06 16:58:36 -070013483 assert(maxchar <= MAX_UNICODE);
13484
Victor Stinnerca9381e2015-09-22 00:58:32 +020013485 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013486 assert((maxchar > writer->maxchar && length >= 0)
13487 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488
Victor Stinner202fdca2012-05-07 12:47:02 +020013489 if (length > PY_SSIZE_T_MAX - writer->pos) {
13490 PyErr_NoMemory();
13491 return -1;
13492 }
13493 newlen = writer->pos + length;
13494
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013495 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013496
Victor Stinnerd3f08822012-05-29 12:57:52 +020013497 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013498 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013499 if (writer->overallocate
13500 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13501 /* overallocate to limit the number of realloc() */
13502 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013504 if (newlen < writer->min_length)
13505 newlen = writer->min_length;
13506
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507 writer->buffer = PyUnicode_New(newlen, maxchar);
13508 if (writer->buffer == NULL)
13509 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013511 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013512 if (writer->overallocate
13513 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13514 /* overallocate to limit the number of realloc() */
13515 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013516 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013517 if (newlen < writer->min_length)
13518 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013520 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013521 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013522 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013523 newbuffer = PyUnicode_New(newlen, maxchar);
13524 if (newbuffer == NULL)
13525 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13527 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013528 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013529 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013530 }
13531 else {
13532 newbuffer = resize_compact(writer->buffer, newlen);
13533 if (newbuffer == NULL)
13534 return -1;
13535 }
13536 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013537 }
13538 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013539 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 newbuffer = PyUnicode_New(writer->size, maxchar);
13541 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013542 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13544 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013545 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013546 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013547 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013548 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013549
13550#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013551}
13552
Victor Stinnerca9381e2015-09-22 00:58:32 +020013553int
13554_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13555 enum PyUnicode_Kind kind)
13556{
13557 Py_UCS4 maxchar;
13558
13559 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13560 assert(writer->kind < kind);
13561
13562 switch (kind)
13563 {
13564 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13565 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13566 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13567 default:
13568 assert(0 && "invalid kind");
13569 return -1;
13570 }
13571
13572 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13573}
13574
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013575static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013576_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013577{
Victor Stinner2740e462016-09-06 16:58:36 -070013578 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013579 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13580 return -1;
13581 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13582 writer->pos++;
13583 return 0;
13584}
13585
13586int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013587_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13588{
13589 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13590}
13591
13592int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013593_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13594{
13595 Py_UCS4 maxchar;
13596 Py_ssize_t len;
13597
13598 if (PyUnicode_READY(str) == -1)
13599 return -1;
13600 len = PyUnicode_GET_LENGTH(str);
13601 if (len == 0)
13602 return 0;
13603 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13604 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013605 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013606 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013607 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013608 Py_INCREF(str);
13609 writer->buffer = str;
13610 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013611 writer->pos += len;
13612 return 0;
13613 }
13614 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13615 return -1;
13616 }
13617 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13618 str, 0, len);
13619 writer->pos += len;
13620 return 0;
13621}
13622
Victor Stinnere215d962012-10-06 23:03:36 +020013623int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013624_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13625 Py_ssize_t start, Py_ssize_t end)
13626{
13627 Py_UCS4 maxchar;
13628 Py_ssize_t len;
13629
13630 if (PyUnicode_READY(str) == -1)
13631 return -1;
13632
13633 assert(0 <= start);
13634 assert(end <= PyUnicode_GET_LENGTH(str));
13635 assert(start <= end);
13636
13637 if (end == 0)
13638 return 0;
13639
13640 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13641 return _PyUnicodeWriter_WriteStr(writer, str);
13642
13643 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13644 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13645 else
13646 maxchar = writer->maxchar;
13647 len = end - start;
13648
13649 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13650 return -1;
13651
13652 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13653 str, start, len);
13654 writer->pos += len;
13655 return 0;
13656}
13657
13658int
Victor Stinner4a587072013-11-19 12:54:53 +010013659_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13660 const char *ascii, Py_ssize_t len)
13661{
13662 if (len == -1)
13663 len = strlen(ascii);
13664
13665 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13666
13667 if (writer->buffer == NULL && !writer->overallocate) {
13668 PyObject *str;
13669
13670 str = _PyUnicode_FromASCII(ascii, len);
13671 if (str == NULL)
13672 return -1;
13673
13674 writer->readonly = 1;
13675 writer->buffer = str;
13676 _PyUnicodeWriter_Update(writer);
13677 writer->pos += len;
13678 return 0;
13679 }
13680
13681 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13682 return -1;
13683
13684 switch (writer->kind)
13685 {
13686 case PyUnicode_1BYTE_KIND:
13687 {
13688 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13689 Py_UCS1 *data = writer->data;
13690
Christian Heimesf051e432016-09-13 20:22:02 +020013691 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013692 break;
13693 }
13694 case PyUnicode_2BYTE_KIND:
13695 {
13696 _PyUnicode_CONVERT_BYTES(
13697 Py_UCS1, Py_UCS2,
13698 ascii, ascii + len,
13699 (Py_UCS2 *)writer->data + writer->pos);
13700 break;
13701 }
13702 case PyUnicode_4BYTE_KIND:
13703 {
13704 _PyUnicode_CONVERT_BYTES(
13705 Py_UCS1, Py_UCS4,
13706 ascii, ascii + len,
13707 (Py_UCS4 *)writer->data + writer->pos);
13708 break;
13709 }
13710 default:
13711 assert(0);
13712 }
13713
13714 writer->pos += len;
13715 return 0;
13716}
13717
13718int
13719_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13720 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013721{
13722 Py_UCS4 maxchar;
13723
13724 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13725 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13726 return -1;
13727 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13728 writer->pos += len;
13729 return 0;
13730}
13731
Victor Stinnerd3f08822012-05-29 12:57:52 +020013732PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013733_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013734{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013735 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013736
Victor Stinnerd3f08822012-05-29 12:57:52 +020013737 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013738 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013739 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013740 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013741
13742 str = writer->buffer;
13743 writer->buffer = NULL;
13744
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013745 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013746 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13747 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013748 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013749
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013750 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13751 PyObject *str2;
13752 str2 = resize_compact(str, writer->pos);
13753 if (str2 == NULL) {
13754 Py_DECREF(str);
13755 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013756 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013757 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013758 }
13759
Victor Stinner15a0bd32013-07-08 22:29:55 +020013760 assert(_PyUnicode_CheckConsistency(str, 1));
13761 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013762}
13763
Victor Stinnerd3f08822012-05-29 12:57:52 +020013764void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013765_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013766{
13767 Py_CLEAR(writer->buffer);
13768}
13769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013770#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013771
13772PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013774\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013775Return a formatted version of S, using substitutions from args and kwargs.\n\
13776The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013777
Eric Smith27bbca62010-11-04 17:06:58 +000013778PyDoc_STRVAR(format_map__doc__,
13779 "S.format_map(mapping) -> str\n\
13780\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013781Return a formatted version of S, using substitutions from mapping.\n\
13782The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013783
Eric Smith4a7d76d2008-05-30 18:10:19 +000013784static PyObject *
13785unicode__format__(PyObject* self, PyObject* args)
13786{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013787 PyObject *format_spec;
13788 _PyUnicodeWriter writer;
13789 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013790
13791 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13792 return NULL;
13793
Victor Stinnerd3f08822012-05-29 12:57:52 +020013794 if (PyUnicode_READY(self) == -1)
13795 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013796 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013797 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13798 self, format_spec, 0,
13799 PyUnicode_GET_LENGTH(format_spec));
13800 if (ret == -1) {
13801 _PyUnicodeWriter_Dealloc(&writer);
13802 return NULL;
13803 }
13804 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013805}
13806
Eric Smith8c663262007-08-25 02:26:07 +000013807PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013809\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013810Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013811
13812static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013813unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013815 Py_ssize_t size;
13816
13817 /* If it's a compact object, account for base structure +
13818 character data. */
13819 if (PyUnicode_IS_COMPACT_ASCII(v))
13820 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13821 else if (PyUnicode_IS_COMPACT(v))
13822 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013823 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013824 else {
13825 /* If it is a two-block object, account for base object, and
13826 for character block if present. */
13827 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013828 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013830 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013831 }
13832 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013833 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013834 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013836 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013837 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013838
13839 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013840}
13841
13842PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013844
13845static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013846unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013847{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013848 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 if (!copy)
13850 return NULL;
13851 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013852}
13853
/* Method table of the str type.

   Each entry gives the Python-level method name, the C implementation,
   the calling convention flags (METH_*) and the docstring.  The table is
   terminated by a NULL sentinel entry. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* NOTE(review): macro defined outside this section; presumably it
       expands to the table entry for maketrans -- confirm. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for this entry. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}    /* sentinel */
};
13911
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013912static PyObject *
13913unicode_mod(PyObject *v, PyObject *w)
13914{
Brian Curtindfc80e32011-08-10 20:28:54 -050013915 if (!PyUnicode_Check(v))
13916 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013917 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013918}
13919
13920static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013921 0, /*nb_add*/
13922 0, /*nb_subtract*/
13923 0, /*nb_multiply*/
13924 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013925};
13926
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013928 (lenfunc) unicode_length, /* sq_length */
13929 PyUnicode_Concat, /* sq_concat */
13930 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13931 (ssizeargfunc) unicode_getitem, /* sq_item */
13932 0, /* sq_slice */
13933 0, /* sq_ass_item */
13934 0, /* sq_ass_slice */
13935 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936};
13937
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013938static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013939unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013941 if (PyUnicode_READY(self) == -1)
13942 return NULL;
13943
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013944 if (PyIndex_Check(item)) {
13945 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013946 if (i == -1 && PyErr_Occurred())
13947 return NULL;
13948 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013950 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013951 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013952 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013953 PyObject *result;
13954 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013955 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013956 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013957
Serhiy Storchakac26b19d2017-04-08 11:18:14 +030013958 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013959 return NULL;
13960 }
Serhiy Storchakac26b19d2017-04-08 11:18:14 +030013961 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13962 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013963
13964 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013965 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013966 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013967 slicelength == PyUnicode_GET_LENGTH(self)) {
13968 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013969 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013970 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013971 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013972 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013973 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013974 src_kind = PyUnicode_KIND(self);
13975 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013976 if (!PyUnicode_IS_ASCII(self)) {
13977 kind_limit = kind_maxchar_limit(src_kind);
13978 max_char = 0;
13979 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13980 ch = PyUnicode_READ(src_kind, src_data, cur);
13981 if (ch > max_char) {
13982 max_char = ch;
13983 if (max_char >= kind_limit)
13984 break;
13985 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013986 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013987 }
Victor Stinner55c99112011-10-13 01:17:06 +020013988 else
13989 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013990 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013991 if (result == NULL)
13992 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013993 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013994 dest_data = PyUnicode_DATA(result);
13995
13996 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013997 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13998 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013999 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014000 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014001 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014002 } else {
14003 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14004 return NULL;
14005 }
14006}
14007
14008static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 (lenfunc)unicode_length, /* mp_length */
14010 (binaryfunc)unicode_subscript, /* mp_subscript */
14011 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014012};
14013
Guido van Rossumd57fd912000-03-10 22:53:23 +000014014
Guido van Rossumd57fd912000-03-10 22:53:23 +000014015/* Helpers for PyUnicode_Format() */
14016
Victor Stinnera47082312012-10-04 02:19:54 +020014017struct unicode_formatter_t {
14018 PyObject *args;
14019 int args_owned;
14020 Py_ssize_t arglen, argidx;
14021 PyObject *dict;
14022
14023 enum PyUnicode_Kind fmtkind;
14024 Py_ssize_t fmtcnt, fmtpos;
14025 void *fmtdata;
14026 PyObject *fmtstr;
14027
14028 _PyUnicodeWriter writer;
14029};
14030
14031struct unicode_format_arg_t {
14032 Py_UCS4 ch;
14033 int flags;
14034 Py_ssize_t width;
14035 int prec;
14036 int sign;
14037};
14038
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014040unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014041{
Victor Stinnera47082312012-10-04 02:19:54 +020014042 Py_ssize_t argidx = ctx->argidx;
14043
14044 if (argidx < ctx->arglen) {
14045 ctx->argidx++;
14046 if (ctx->arglen < 0)
14047 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014048 else
Victor Stinnera47082312012-10-04 02:19:54 +020014049 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014050 }
14051 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014052 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014053 return NULL;
14054}
14055
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014056/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014057
Victor Stinnera47082312012-10-04 02:19:54 +020014058/* Format a float into the writer if the writer is not NULL, or into *p_output
14059 otherwise.
14060
14061 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014062static int
Victor Stinnera47082312012-10-04 02:19:54 +020014063formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14064 PyObject **p_output,
14065 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014066{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014067 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014068 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014069 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014070 int prec;
14071 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014072
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073 x = PyFloat_AsDouble(v);
14074 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014075 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014076
Victor Stinnera47082312012-10-04 02:19:54 +020014077 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014078 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014079 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014080
Victor Stinnera47082312012-10-04 02:19:54 +020014081 if (arg->flags & F_ALT)
14082 dtoa_flags = Py_DTSF_ALT;
14083 else
14084 dtoa_flags = 0;
14085 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014086 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014087 return -1;
14088 len = strlen(p);
14089 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014090 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014091 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014093 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014094 }
14095 else
14096 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014097 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014098 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099}
14100
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python's long (unbounded) ints.
 * Return value: a new PyUnicodeObject*, or NULL if error.
 *  The output string is of the form
 *      "-"? ("0o" | "0x" | "0X")? digit+
 *  The base marker is present only for o, x and X conversions, and only
 *  when alt is non-zero.  The case of hex digits will be correct,
 *  There will be at least prec digits, zero-filled on the left if
 *  necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    /* Up to 3 non-digit characters (sign plus a 2-character base marker)
       are added to prec below, in int arithmetic. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* fall through: without assertions an unknown type is formatted
           as decimal */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the two marker characters; the sign, if any, is rewritten
           over the second one so that it stays in front of the digits. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: non-digit
           prefix, then the zeros, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        /* From here on result is the bytes object, not a str. */
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* A new str is needed when the text lives in the temporary bytes
       object or no longer starts at the beginning of the str data;
       otherwise the str is only resized if its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14246
Ethan Furmandf3ed242014-01-05 06:50:30 -080014247/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014248 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014249 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014250 * -1 and raise an exception on error */
14251static int
Victor Stinnera47082312012-10-04 02:19:54 +020014252mainformatlong(PyObject *v,
14253 struct unicode_format_arg_t *arg,
14254 PyObject **p_output,
14255 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014256{
14257 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014258 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014259
14260 if (!PyNumber_Check(v))
14261 goto wrongtype;
14262
Ethan Furman9ab74802014-03-21 06:38:46 -070014263 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014265 if (type == 'o' || type == 'x' || type == 'X') {
14266 iobj = PyNumber_Index(v);
14267 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014268 if (PyErr_ExceptionMatches(PyExc_TypeError))
14269 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014270 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014271 }
14272 }
14273 else {
14274 iobj = PyNumber_Long(v);
14275 if (iobj == NULL ) {
14276 if (PyErr_ExceptionMatches(PyExc_TypeError))
14277 goto wrongtype;
14278 return -1;
14279 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280 }
14281 assert(PyLong_Check(iobj));
14282 }
14283 else {
14284 iobj = v;
14285 Py_INCREF(iobj);
14286 }
14287
14288 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014289 && arg->width == -1 && arg->prec == -1
14290 && !(arg->flags & (F_SIGN | F_BLANK))
14291 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014292 {
14293 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014294 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014295 int base;
14296
Victor Stinnera47082312012-10-04 02:19:54 +020014297 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014298 {
14299 default:
14300 assert(0 && "'type' not in [diuoxX]");
14301 case 'd':
14302 case 'i':
14303 case 'u':
14304 base = 10;
14305 break;
14306 case 'o':
14307 base = 8;
14308 break;
14309 case 'x':
14310 case 'X':
14311 base = 16;
14312 break;
14313 }
14314
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014315 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14316 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014318 }
14319 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014320 return 1;
14321 }
14322
Ethan Furmanb95b5612015-01-23 20:05:18 -080014323 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014324 Py_DECREF(iobj);
14325 if (res == NULL)
14326 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014327 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014328 return 0;
14329
14330wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014331 switch(type)
14332 {
14333 case 'o':
14334 case 'x':
14335 case 'X':
14336 PyErr_Format(PyExc_TypeError,
14337 "%%%c format: an integer is required, "
14338 "not %.200s",
14339 type, Py_TYPE(v)->tp_name);
14340 break;
14341 default:
14342 PyErr_Format(PyExc_TypeError,
14343 "%%%c format: a number is required, "
14344 "not %.200s",
14345 type, Py_TYPE(v)->tp_name);
14346 break;
14347 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 return -1;
14349}
14350
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014351static Py_UCS4
14352formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014353{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014354 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014355 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014357 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014358 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014359 goto onError;
14360 }
14361 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014362 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014363 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014364 /* make sure number is a type of integer */
14365 if (!PyLong_Check(v)) {
14366 iobj = PyNumber_Index(v);
14367 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014368 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014369 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014370 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014371 Py_DECREF(iobj);
14372 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014373 else {
14374 x = PyLong_AsLong(v);
14375 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014376 if (x == -1 && PyErr_Occurred())
14377 goto onError;
14378
Victor Stinner8faf8212011-12-08 22:14:11 +010014379 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014380 PyErr_SetString(PyExc_OverflowError,
14381 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014382 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014383 }
14384
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014385 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014386 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014387
Benjamin Peterson29060642009-01-31 22:14:21 +000014388 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014389 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014390 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014391 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014392}
14393
Victor Stinnera47082312012-10-04 02:19:54 +020014394/* Parse options of an argument: flags, width, precision.
14395 Handle also "%(name)" syntax.
14396
14397 Return 0 if the argument has been formatted into arg->str.
14398 Return 1 if the argument has been written into ctx->writer,
14399 Raise an exception and return -1 on error. */
14400static int
14401unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14402 struct unicode_format_arg_t *arg)
14403{
14404#define FORMAT_READ(ctx) \
14405 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14406
14407 PyObject *v;
14408
Victor Stinnera47082312012-10-04 02:19:54 +020014409 if (arg->ch == '(') {
14410 /* Get argument value from a dictionary. Example: "%(name)s". */
14411 Py_ssize_t keystart;
14412 Py_ssize_t keylen;
14413 PyObject *key;
14414 int pcount = 1;
14415
14416 if (ctx->dict == NULL) {
14417 PyErr_SetString(PyExc_TypeError,
14418 "format requires a mapping");
14419 return -1;
14420 }
14421 ++ctx->fmtpos;
14422 --ctx->fmtcnt;
14423 keystart = ctx->fmtpos;
14424 /* Skip over balanced parentheses */
14425 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14426 arg->ch = FORMAT_READ(ctx);
14427 if (arg->ch == ')')
14428 --pcount;
14429 else if (arg->ch == '(')
14430 ++pcount;
14431 ctx->fmtpos++;
14432 }
14433 keylen = ctx->fmtpos - keystart - 1;
14434 if (ctx->fmtcnt < 0 || pcount > 0) {
14435 PyErr_SetString(PyExc_ValueError,
14436 "incomplete format key");
14437 return -1;
14438 }
14439 key = PyUnicode_Substring(ctx->fmtstr,
14440 keystart, keystart + keylen);
14441 if (key == NULL)
14442 return -1;
14443 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014444 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014445 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014446 }
14447 ctx->args = PyObject_GetItem(ctx->dict, key);
14448 Py_DECREF(key);
14449 if (ctx->args == NULL)
14450 return -1;
14451 ctx->args_owned = 1;
14452 ctx->arglen = -1;
14453 ctx->argidx = -2;
14454 }
14455
14456 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014457 while (--ctx->fmtcnt >= 0) {
14458 arg->ch = FORMAT_READ(ctx);
14459 ctx->fmtpos++;
14460 switch (arg->ch) {
14461 case '-': arg->flags |= F_LJUST; continue;
14462 case '+': arg->flags |= F_SIGN; continue;
14463 case ' ': arg->flags |= F_BLANK; continue;
14464 case '#': arg->flags |= F_ALT; continue;
14465 case '0': arg->flags |= F_ZERO; continue;
14466 }
14467 break;
14468 }
14469
14470 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014471 if (arg->ch == '*') {
14472 v = unicode_format_getnextarg(ctx);
14473 if (v == NULL)
14474 return -1;
14475 if (!PyLong_Check(v)) {
14476 PyErr_SetString(PyExc_TypeError,
14477 "* wants int");
14478 return -1;
14479 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014480 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014481 if (arg->width == -1 && PyErr_Occurred())
14482 return -1;
14483 if (arg->width < 0) {
14484 arg->flags |= F_LJUST;
14485 arg->width = -arg->width;
14486 }
14487 if (--ctx->fmtcnt >= 0) {
14488 arg->ch = FORMAT_READ(ctx);
14489 ctx->fmtpos++;
14490 }
14491 }
14492 else if (arg->ch >= '0' && arg->ch <= '9') {
14493 arg->width = arg->ch - '0';
14494 while (--ctx->fmtcnt >= 0) {
14495 arg->ch = FORMAT_READ(ctx);
14496 ctx->fmtpos++;
14497 if (arg->ch < '0' || arg->ch > '9')
14498 break;
14499 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14500 mixing signed and unsigned comparison. Since arg->ch is between
14501 '0' and '9', casting to int is safe. */
14502 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14503 PyErr_SetString(PyExc_ValueError,
14504 "width too big");
14505 return -1;
14506 }
14507 arg->width = arg->width*10 + (arg->ch - '0');
14508 }
14509 }
14510
14511 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014512 if (arg->ch == '.') {
14513 arg->prec = 0;
14514 if (--ctx->fmtcnt >= 0) {
14515 arg->ch = FORMAT_READ(ctx);
14516 ctx->fmtpos++;
14517 }
14518 if (arg->ch == '*') {
14519 v = unicode_format_getnextarg(ctx);
14520 if (v == NULL)
14521 return -1;
14522 if (!PyLong_Check(v)) {
14523 PyErr_SetString(PyExc_TypeError,
14524 "* wants int");
14525 return -1;
14526 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014527 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014528 if (arg->prec == -1 && PyErr_Occurred())
14529 return -1;
14530 if (arg->prec < 0)
14531 arg->prec = 0;
14532 if (--ctx->fmtcnt >= 0) {
14533 arg->ch = FORMAT_READ(ctx);
14534 ctx->fmtpos++;
14535 }
14536 }
14537 else if (arg->ch >= '0' && arg->ch <= '9') {
14538 arg->prec = arg->ch - '0';
14539 while (--ctx->fmtcnt >= 0) {
14540 arg->ch = FORMAT_READ(ctx);
14541 ctx->fmtpos++;
14542 if (arg->ch < '0' || arg->ch > '9')
14543 break;
14544 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14545 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014546 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014547 return -1;
14548 }
14549 arg->prec = arg->prec*10 + (arg->ch - '0');
14550 }
14551 }
14552 }
14553
14554 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14555 if (ctx->fmtcnt >= 0) {
14556 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14557 if (--ctx->fmtcnt >= 0) {
14558 arg->ch = FORMAT_READ(ctx);
14559 ctx->fmtpos++;
14560 }
14561 }
14562 }
14563 if (ctx->fmtcnt < 0) {
14564 PyErr_SetString(PyExc_ValueError,
14565 "incomplete format");
14566 return -1;
14567 }
14568 return 0;
14569
14570#undef FORMAT_READ
14571}
14572
14573/* Format one argument. Supported conversion specifiers:
14574
14575 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014576 - "i", "d", "u": int or float
14577 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014578 - "e", "E", "f", "F", "g", "G": float
14579 - "c": int or str (1 character)
14580
Victor Stinner8dbd4212012-12-04 09:30:24 +010014581 When possible, the output is written directly into the Unicode writer
14582 (ctx->writer). A string is created when padding is required.
14583
Victor Stinnera47082312012-10-04 02:19:54 +020014584 Return 0 if the argument has been formatted into *p_str,
14585 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014586 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014587static int
14588unicode_format_arg_format(struct unicode_formatter_t *ctx,
14589 struct unicode_format_arg_t *arg,
14590 PyObject **p_str)
14591{
14592 PyObject *v;
14593 _PyUnicodeWriter *writer = &ctx->writer;
14594
14595 if (ctx->fmtcnt == 0)
14596 ctx->writer.overallocate = 0;
14597
14598 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014599 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014600 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014601 return 1;
14602 }
14603
14604 v = unicode_format_getnextarg(ctx);
14605 if (v == NULL)
14606 return -1;
14607
Victor Stinnera47082312012-10-04 02:19:54 +020014608
14609 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014610 case 's':
14611 case 'r':
14612 case 'a':
14613 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14614 /* Fast path */
14615 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14616 return -1;
14617 return 1;
14618 }
14619
14620 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14621 *p_str = v;
14622 Py_INCREF(*p_str);
14623 }
14624 else {
14625 if (arg->ch == 's')
14626 *p_str = PyObject_Str(v);
14627 else if (arg->ch == 'r')
14628 *p_str = PyObject_Repr(v);
14629 else
14630 *p_str = PyObject_ASCII(v);
14631 }
14632 break;
14633
14634 case 'i':
14635 case 'd':
14636 case 'u':
14637 case 'o':
14638 case 'x':
14639 case 'X':
14640 {
14641 int ret = mainformatlong(v, arg, p_str, writer);
14642 if (ret != 0)
14643 return ret;
14644 arg->sign = 1;
14645 break;
14646 }
14647
14648 case 'e':
14649 case 'E':
14650 case 'f':
14651 case 'F':
14652 case 'g':
14653 case 'G':
14654 if (arg->width == -1 && arg->prec == -1
14655 && !(arg->flags & (F_SIGN | F_BLANK)))
14656 {
14657 /* Fast path */
14658 if (formatfloat(v, arg, NULL, writer) == -1)
14659 return -1;
14660 return 1;
14661 }
14662
14663 arg->sign = 1;
14664 if (formatfloat(v, arg, p_str, NULL) == -1)
14665 return -1;
14666 break;
14667
14668 case 'c':
14669 {
14670 Py_UCS4 ch = formatchar(v);
14671 if (ch == (Py_UCS4) -1)
14672 return -1;
14673 if (arg->width == -1 && arg->prec == -1) {
14674 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014675 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014676 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014677 return 1;
14678 }
14679 *p_str = PyUnicode_FromOrdinal(ch);
14680 break;
14681 }
14682
14683 default:
14684 PyErr_Format(PyExc_ValueError,
14685 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014686 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014687 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14688 (int)arg->ch,
14689 ctx->fmtpos - 1);
14690 return -1;
14691 }
14692 if (*p_str == NULL)
14693 return -1;
14694 assert (PyUnicode_Check(*p_str));
14695 return 0;
14696}
14697
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and padding options stored in arg.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Pad with '0' only when arg->sign is set and the "0" flag was given;
       otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: skip it in str and do not
               count it in len. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to write at all */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only matters for maxchar if left padding will
       actually be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* One extra slot for the sign when the characters alone already fill
       the whole width. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space padding it is written later, after the padding. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           remaining characters, whichever branch writes them. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* No room left for right padding */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, never the '0' fill) */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14852
14853/* Helper of PyUnicode_Format(): format one arg.
14854 Return 0 on success, raise an exception and return -1 on error. */
14855static int
14856unicode_format_arg(struct unicode_formatter_t *ctx)
14857{
14858 struct unicode_format_arg_t arg;
14859 PyObject *str;
14860 int ret;
14861
Victor Stinner8dbd4212012-12-04 09:30:24 +010014862 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14863 arg.flags = 0;
14864 arg.width = -1;
14865 arg.prec = -1;
14866 arg.sign = 0;
14867 str = NULL;
14868
Victor Stinnera47082312012-10-04 02:19:54 +020014869 ret = unicode_format_arg_parse(ctx, &arg);
14870 if (ret == -1)
14871 return -1;
14872
14873 ret = unicode_format_arg_format(ctx, &arg, &str);
14874 if (ret == -1)
14875 return -1;
14876
14877 if (ret != 1) {
14878 ret = unicode_format_arg_output(ctx, &arg, str);
14879 Py_DECREF(str);
14880 if (ret == -1)
14881 return -1;
14882 }
14883
14884 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14885 PyErr_SetString(PyExc_TypeError,
14886 "not all arguments converted during string formatting");
14887 return -1;
14888 }
14889 return 0;
14890}
14891
Alexander Belopolsky40018472011-02-26 01:02:56 +000014892PyObject *
14893PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014894{
Victor Stinnera47082312012-10-04 02:19:54 +020014895 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014896
Guido van Rossumd57fd912000-03-10 22:53:23 +000014897 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014898 PyErr_BadInternalCall();
14899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014900 }
Victor Stinnera47082312012-10-04 02:19:54 +020014901
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014902 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014903 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014904
14905 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014906 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14907 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14908 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14909 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014910
Victor Stinner8f674cc2013-04-17 23:02:17 +020014911 _PyUnicodeWriter_Init(&ctx.writer);
14912 ctx.writer.min_length = ctx.fmtcnt + 100;
14913 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014914
Guido van Rossumd57fd912000-03-10 22:53:23 +000014915 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014916 ctx.arglen = PyTuple_Size(args);
14917 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014918 }
14919 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014920 ctx.arglen = -1;
14921 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922 }
Victor Stinnera47082312012-10-04 02:19:54 +020014923 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014924 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014925 ctx.dict = args;
14926 else
14927 ctx.dict = NULL;
14928 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014929
Victor Stinnera47082312012-10-04 02:19:54 +020014930 while (--ctx.fmtcnt >= 0) {
14931 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014932 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014933
14934 nonfmtpos = ctx.fmtpos++;
14935 while (ctx.fmtcnt >= 0 &&
14936 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14937 ctx.fmtpos++;
14938 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014939 }
Victor Stinnera47082312012-10-04 02:19:54 +020014940 if (ctx.fmtcnt < 0) {
14941 ctx.fmtpos--;
14942 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014943 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014944
Victor Stinnercfc4c132013-04-03 01:48:39 +020014945 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14946 nonfmtpos, ctx.fmtpos) < 0)
14947 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014948 }
14949 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014950 ctx.fmtpos++;
14951 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014952 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014953 }
14954 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014955
Victor Stinnera47082312012-10-04 02:19:54 +020014956 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014957 PyErr_SetString(PyExc_TypeError,
14958 "not all arguments converted during string formatting");
14959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960 }
14961
Victor Stinnera47082312012-10-04 02:19:54 +020014962 if (ctx.args_owned) {
14963 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964 }
Victor Stinnera47082312012-10-04 02:19:54 +020014965 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966
Benjamin Peterson29060642009-01-31 22:14:21 +000014967 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014968 _PyUnicodeWriter_Dealloc(&ctx.writer);
14969 if (ctx.args_owned) {
14970 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014971 }
14972 return NULL;
14973}
14974
Jeremy Hylton938ace62002-07-17 16:30:39 +000014975static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014976unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14977
Tim Peters6d6c1a32001-08-02 04:15:00 +000014978static PyObject *
14979unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14980{
Benjamin Peterson29060642009-01-31 22:14:21 +000014981 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014982 static char *kwlist[] = {"object", "encoding", "errors", 0};
14983 char *encoding = NULL;
14984 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014985
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 if (type != &PyUnicode_Type)
14987 return unicode_subtype_new(type, args, kwds);
14988 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014989 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 return NULL;
14991 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014992 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014993 if (encoding == NULL && errors == NULL)
14994 return PyObject_Str(x);
14995 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014996 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014997}
14998
Guido van Rossume023fe02001-08-30 03:12:59 +000014999static PyObject *
15000unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15001{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015002 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015003 Py_ssize_t length, char_size;
15004 int share_wstr, share_utf8;
15005 unsigned int kind;
15006 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015007
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015009
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015010 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015011 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015012 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015013 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015014 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015015 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015016 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015017 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015018
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015019 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015020 if (self == NULL) {
15021 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015022 return NULL;
15023 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015024 kind = PyUnicode_KIND(unicode);
15025 length = PyUnicode_GET_LENGTH(unicode);
15026
15027 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015028#ifdef Py_DEBUG
15029 _PyUnicode_HASH(self) = -1;
15030#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015031 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015032#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015033 _PyUnicode_STATE(self).interned = 0;
15034 _PyUnicode_STATE(self).kind = kind;
15035 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015036 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015037 _PyUnicode_STATE(self).ready = 1;
15038 _PyUnicode_WSTR(self) = NULL;
15039 _PyUnicode_UTF8_LENGTH(self) = 0;
15040 _PyUnicode_UTF8(self) = NULL;
15041 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015042 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043
15044 share_utf8 = 0;
15045 share_wstr = 0;
15046 if (kind == PyUnicode_1BYTE_KIND) {
15047 char_size = 1;
15048 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15049 share_utf8 = 1;
15050 }
15051 else if (kind == PyUnicode_2BYTE_KIND) {
15052 char_size = 2;
15053 if (sizeof(wchar_t) == 2)
15054 share_wstr = 1;
15055 }
15056 else {
15057 assert(kind == PyUnicode_4BYTE_KIND);
15058 char_size = 4;
15059 if (sizeof(wchar_t) == 4)
15060 share_wstr = 1;
15061 }
15062
15063 /* Ensure we won't overflow the length. */
15064 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15065 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015066 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068 data = PyObject_MALLOC((length + 1) * char_size);
15069 if (data == NULL) {
15070 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015071 goto onError;
15072 }
15073
Victor Stinnerc3c74152011-10-02 20:39:55 +020015074 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 if (share_utf8) {
15076 _PyUnicode_UTF8_LENGTH(self) = length;
15077 _PyUnicode_UTF8(self) = data;
15078 }
15079 if (share_wstr) {
15080 _PyUnicode_WSTR_LENGTH(self) = length;
15081 _PyUnicode_WSTR(self) = (wchar_t *)data;
15082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015083
Christian Heimesf051e432016-09-13 20:22:02 +020015084 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015085 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015086 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015087#ifdef Py_DEBUG
15088 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15089#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015090 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015091 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015092
15093onError:
15094 Py_DECREF(unicode);
15095 Py_DECREF(self);
15096 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015097}
15098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015099PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015100"str(object='') -> str\n\
15101str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015102\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015103Create a new string object from the given object. If encoding or\n\
15104errors is specified, then the object must expose a data buffer\n\
15105that will be decoded using the given encoding and error handler.\n\
15106Otherwise, returns the result of object.__str__() (if defined)\n\
15107or repr(object).\n\
15108encoding defaults to sys.getdefaultencoding().\n\
15109errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015110
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015111static PyObject *unicode_iter(PyObject *seq);
15112
Guido van Rossumd57fd912000-03-10 22:53:23 +000015113PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015114 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 "str", /* tp_name */
15116 sizeof(PyUnicodeObject), /* tp_size */
15117 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015118 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 (destructor)unicode_dealloc, /* tp_dealloc */
15120 0, /* tp_print */
15121 0, /* tp_getattr */
15122 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015123 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015124 unicode_repr, /* tp_repr */
15125 &unicode_as_number, /* tp_as_number */
15126 &unicode_as_sequence, /* tp_as_sequence */
15127 &unicode_as_mapping, /* tp_as_mapping */
15128 (hashfunc) unicode_hash, /* tp_hash*/
15129 0, /* tp_call*/
15130 (reprfunc) unicode_str, /* tp_str */
15131 PyObject_GenericGetAttr, /* tp_getattro */
15132 0, /* tp_setattro */
15133 0, /* tp_as_buffer */
15134 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015135 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 unicode_doc, /* tp_doc */
15137 0, /* tp_traverse */
15138 0, /* tp_clear */
15139 PyUnicode_RichCompare, /* tp_richcompare */
15140 0, /* tp_weaklistoffset */
15141 unicode_iter, /* tp_iter */
15142 0, /* tp_iternext */
15143 unicode_methods, /* tp_methods */
15144 0, /* tp_members */
15145 0, /* tp_getset */
15146 &PyBaseObject_Type, /* tp_base */
15147 0, /* tp_dict */
15148 0, /* tp_descr_get */
15149 0, /* tp_descr_set */
15150 0, /* tp_dictoffset */
15151 0, /* tp_init */
15152 0, /* tp_alloc */
15153 unicode_new, /* tp_new */
15154 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015155};
15156
15157/* Initialize the Unicode implementation */
15158
Victor Stinner3a50e702011-10-18 21:21:00 +020015159int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015160{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015161 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015162 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015163 0x000A, /* LINE FEED */
15164 0x000D, /* CARRIAGE RETURN */
15165 0x001C, /* FILE SEPARATOR */
15166 0x001D, /* GROUP SEPARATOR */
15167 0x001E, /* RECORD SEPARATOR */
15168 0x0085, /* NEXT LINE */
15169 0x2028, /* LINE SEPARATOR */
15170 0x2029, /* PARAGRAPH SEPARATOR */
15171 };
15172
Fred Drakee4315f52000-05-09 19:53:39 +000015173 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015174 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015175 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015176 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015177 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015178
Guido van Rossumcacfc072002-05-24 19:01:59 +000015179 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015180 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015181
15182 /* initialize the linebreak bloom filter */
15183 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015184 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015185 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015186
Christian Heimes26532f72013-07-20 14:57:16 +020015187 if (PyType_Ready(&EncodingMapType) < 0)
15188 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015189
Benjamin Petersonc4311282012-10-30 23:21:10 -040015190 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15191 Py_FatalError("Can't initialize field name iterator type");
15192
15193 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15194 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015195
Victor Stinner3a50e702011-10-18 21:21:00 +020015196 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015197}
15198
15199/* Finalize the Unicode implementation */
15200
/* Performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15206
Guido van Rossumd57fd912000-03-10 22:53:23 +000015207void
Thomas Wouters78890102000-07-22 19:25:51 +000015208_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015209{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015210 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015211
Serhiy Storchaka05997252013-01-26 12:14:02 +020015212 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015213
Serhiy Storchaka05997252013-01-26 12:14:02 +020015214 for (i = 0; i < 256; i++)
15215 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015216 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015217 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015218}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015219
Walter Dörwald16807132007-05-25 13:52:07 +000015220void
15221PyUnicode_InternInPlace(PyObject **p)
15222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015223 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015224 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015225#ifdef Py_DEBUG
15226 assert(s != NULL);
15227 assert(_PyUnicode_CHECK(s));
15228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015230 return;
15231#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 /* If it's a subclass, we don't really know what putting
15233 it in the interned dict might do. */
15234 if (!PyUnicode_CheckExact(s))
15235 return;
15236 if (PyUnicode_CHECK_INTERNED(s))
15237 return;
15238 if (interned == NULL) {
15239 interned = PyDict_New();
15240 if (interned == NULL) {
15241 PyErr_Clear(); /* Don't leave an exception */
15242 return;
15243 }
15244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015246 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015248 if (t == NULL) {
15249 PyErr_Clear();
15250 return;
15251 }
15252 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015253 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015254 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015255 return;
15256 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 /* The two references in interned are not counted by refcnt.
15258 The deallocator will take care of this */
15259 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015260 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015261}
15262
15263void
15264PyUnicode_InternImmortal(PyObject **p)
15265{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 PyUnicode_InternInPlace(p);
15267 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015268 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 Py_INCREF(*p);
15270 }
Walter Dörwald16807132007-05-25 13:52:07 +000015271}
15272
15273PyObject *
15274PyUnicode_InternFromString(const char *cp)
15275{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 PyObject *s = PyUnicode_FromString(cp);
15277 if (s == NULL)
15278 return NULL;
15279 PyUnicode_InternInPlace(&s);
15280 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015281}
15282
Alexander Belopolsky40018472011-02-26 01:02:56 +000015283void
15284_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015285{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015287 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 Py_ssize_t i, n;
15289 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015290
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 if (interned == NULL || !PyDict_Check(interned))
15292 return;
15293 keys = PyDict_Keys(interned);
15294 if (keys == NULL || !PyList_Check(keys)) {
15295 PyErr_Clear();
15296 return;
15297 }
Walter Dörwald16807132007-05-25 13:52:07 +000015298
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15300 detector, interned unicode strings are not forcibly deallocated;
15301 rather, we give them their stolen references back, and then clear
15302 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015303
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 n = PyList_GET_SIZE(keys);
15305 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015306 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015308 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015309 if (PyUnicode_READY(s) == -1) {
15310 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015311 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015313 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 case SSTATE_NOT_INTERNED:
15315 /* XXX Shouldn't happen */
15316 break;
15317 case SSTATE_INTERNED_IMMORTAL:
15318 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015319 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 break;
15321 case SSTATE_INTERNED_MORTAL:
15322 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015323 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015324 break;
15325 default:
15326 Py_FatalError("Inconsistent interned string state.");
15327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015328 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 }
15330 fprintf(stderr, "total size of all interned strings: "
15331 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15332 "mortal/immortal\n", mortal_size, immortal_size);
15333 Py_DECREF(keys);
15334 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015335 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015336}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015337
15338
15339/********************* Unicode Iterator **************************/
15340
15341typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 PyObject_HEAD
15343 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015344 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015345} unicodeiterobject;
15346
15347static void
15348unicodeiter_dealloc(unicodeiterobject *it)
15349{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 _PyObject_GC_UNTRACK(it);
15351 Py_XDECREF(it->it_seq);
15352 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015353}
15354
15355static int
15356unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15357{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015358 Py_VISIT(it->it_seq);
15359 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015360}
15361
15362static PyObject *
15363unicodeiter_next(unicodeiterobject *it)
15364{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015365 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015366
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 assert(it != NULL);
15368 seq = it->it_seq;
15369 if (seq == NULL)
15370 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015371 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015373 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15374 int kind = PyUnicode_KIND(seq);
15375 void *data = PyUnicode_DATA(seq);
15376 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15377 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 if (item != NULL)
15379 ++it->it_index;
15380 return item;
15381 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015382
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015384 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015385 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015386}
15387
15388static PyObject *
15389unicodeiter_len(unicodeiterobject *it)
15390{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 Py_ssize_t len = 0;
15392 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015393 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015394 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395}
15396
15397PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15398
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015399static PyObject *
15400unicodeiter_reduce(unicodeiterobject *it)
15401{
15402 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015403 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015404 it->it_seq, it->it_index);
15405 } else {
15406 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15407 if (u == NULL)
15408 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015409 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015410 }
15411}
15412
15413PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15414
15415static PyObject *
15416unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15417{
15418 Py_ssize_t index = PyLong_AsSsize_t(state);
15419 if (index == -1 && PyErr_Occurred())
15420 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015421 if (it->it_seq != NULL) {
15422 if (index < 0)
15423 index = 0;
15424 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15425 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15426 it->it_index = index;
15427 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015428 Py_RETURN_NONE;
15429}
15430
15431PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15432
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015433static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015435 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015436 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15437 reduce_doc},
15438 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15439 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015441};
15442
15443PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015444 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15445 "str_iterator", /* tp_name */
15446 sizeof(unicodeiterobject), /* tp_basicsize */
15447 0, /* tp_itemsize */
15448 /* methods */
15449 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15450 0, /* tp_print */
15451 0, /* tp_getattr */
15452 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015453 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015454 0, /* tp_repr */
15455 0, /* tp_as_number */
15456 0, /* tp_as_sequence */
15457 0, /* tp_as_mapping */
15458 0, /* tp_hash */
15459 0, /* tp_call */
15460 0, /* tp_str */
15461 PyObject_GenericGetAttr, /* tp_getattro */
15462 0, /* tp_setattro */
15463 0, /* tp_as_buffer */
15464 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15465 0, /* tp_doc */
15466 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15467 0, /* tp_clear */
15468 0, /* tp_richcompare */
15469 0, /* tp_weaklistoffset */
15470 PyObject_SelfIter, /* tp_iter */
15471 (iternextfunc)unicodeiter_next, /* tp_iternext */
15472 unicodeiter_methods, /* tp_methods */
15473 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015474};
15475
15476static PyObject *
15477unicode_iter(PyObject *seq)
15478{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015479 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015480
Benjamin Peterson14339b62009-01-31 16:36:08 +000015481 if (!PyUnicode_Check(seq)) {
15482 PyErr_BadInternalCall();
15483 return NULL;
15484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015485 if (PyUnicode_READY(seq) == -1)
15486 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15488 if (it == NULL)
15489 return NULL;
15490 it->it_index = 0;
15491 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015492 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015493 _PyObject_GC_TRACK(it);
15494 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015495}
15496
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015497
15498size_t
15499Py_UNICODE_strlen(const Py_UNICODE *u)
15500{
15501 int res = 0;
15502 while(*u++)
15503 res++;
15504 return res;
15505}
15506
15507Py_UNICODE*
15508Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15509{
15510 Py_UNICODE *u = s1;
15511 while ((*u++ = *s2++));
15512 return s1;
15513}
15514
15515Py_UNICODE*
15516Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15517{
15518 Py_UNICODE *u = s1;
15519 while ((*u++ = *s2++))
15520 if (n-- == 0)
15521 break;
15522 return s1;
15523}
15524
15525Py_UNICODE*
15526Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15527{
15528 Py_UNICODE *u1 = s1;
15529 u1 += Py_UNICODE_strlen(u1);
15530 Py_UNICODE_strcpy(u1, s2);
15531 return s1;
15532}
15533
15534int
15535Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15536{
15537 while (*s1 && *s2 && *s1 == *s2)
15538 s1++, s2++;
15539 if (*s1 && *s2)
15540 return (*s1 < *s2) ? -1 : +1;
15541 if (*s1)
15542 return 1;
15543 if (*s2)
15544 return -1;
15545 return 0;
15546}
15547
15548int
15549Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15550{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015551 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015552 for (; n != 0; n--) {
15553 u1 = *s1;
15554 u2 = *s2;
15555 if (u1 != u2)
15556 return (u1 < u2) ? -1 : +1;
15557 if (u1 == '\0')
15558 return 0;
15559 s1++;
15560 s2++;
15561 }
15562 return 0;
15563}
15564
15565Py_UNICODE*
15566Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15567{
15568 const Py_UNICODE *p;
15569 for (p = s; *p; p++)
15570 if (*p == c)
15571 return (Py_UNICODE*)p;
15572 return NULL;
15573}
15574
15575Py_UNICODE*
15576Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15577{
15578 const Py_UNICODE *p;
15579 p = s + Py_UNICODE_strlen(s);
15580 while (p != s) {
15581 p--;
15582 if (*p == c)
15583 return (Py_UNICODE*)p;
15584 }
15585 return NULL;
15586}
Victor Stinner331ea922010-08-10 16:37:20 +000015587
Victor Stinner71133ff2010-09-01 23:43:53 +000015588Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015589PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015590{
Victor Stinner577db2c2011-10-11 22:12:48 +020015591 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015592 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015594 if (!PyUnicode_Check(unicode)) {
15595 PyErr_BadArgument();
15596 return NULL;
15597 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015598 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015599 if (u == NULL)
15600 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015601 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015602 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015603 PyErr_NoMemory();
15604 return NULL;
15605 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015606 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015607 size *= sizeof(Py_UNICODE);
15608 copy = PyMem_Malloc(size);
15609 if (copy == NULL) {
15610 PyErr_NoMemory();
15611 return NULL;
15612 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015613 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015614 return copy;
15615}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015616
Georg Brandl66c221e2010-10-14 07:04:07 +000015617/* A _string module, to export formatter_parser and formatter_field_name_split
15618 to the string.Formatter class implemented in Python. */
15619
15620static PyMethodDef _string_methods[] = {
15621 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15622 METH_O, PyDoc_STR("split the argument as a field name")},
15623 {"formatter_parser", (PyCFunction) formatter_parser,
15624 METH_O, PyDoc_STR("parse the argument as a format string")},
15625 {NULL, NULL}
15626};
15627
15628static struct PyModuleDef _string_module = {
15629 PyModuleDef_HEAD_INIT,
15630 "_string",
15631 PyDoc_STR("string helper module"),
15632 0,
15633 _string_methods,
15634 NULL,
15635 NULL,
15636 NULL,
15637 NULL
15638};
15639
15640PyMODINIT_FUNC
15641PyInit__string(void)
15642{
15643 return PyModule_Create(&_string_module);
15644}
15645
15646
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015647#ifdef __cplusplus
15648}
15649#endif