/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "ucnhash.h"
#include "bytes_methods.h"
#include "stringlib/eq.h"

#ifdef MS_WINDOWS
#include <windows.h>
#endif

/*[clinic input]
class str "PyUnicodeObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process, Unicode objects may
      be created before the Unicode type is ready.

*/


/* Give the definitions that follow C linkage when this file is compiled as
   C++.  The matching closing brace is not part of this section of the
   file. */
#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Validity check used inside the assert()s of the accessor macros.
   Debug builds run the structural consistency check with check_content=0
   (the characters themselves are not scanned); other builds only test
   that op is a str instance. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Field accessors for the three str object layouts (PyASCIIObject,
   PyCompactUnicodeObject, PyUnicodeObject).

   Macros that expand to a bare member access (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH,
   _PyUnicode_DATA_ANY) perform no checking at all.  The remaining ones
   assert that op passes _PyUnicode_CHECK before reading the field. */

/* Cached UTF-8 pointer stored in the compact header (unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string: for compact ASCII strings this is the
   memory immediately following the PyASCIIObject header, otherwise the
   stored utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Stored length of the cached UTF-8 representation (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the string length itself for compact
   ASCII strings, otherwise the stored utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wchar_t representation pointer and its length (unchecked). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state bit-field and cached hash of the base header (unchecked). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked reads of state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Character data pointer of the PyUnicodeObject layout (unchecked). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for the public PyUnicode_READY macro.
   Evaluates to 0 when op is already ready, otherwise to the result of
   _PyUnicode_Ready(op).  op must pass _PyUnicode_CHECK (asserted). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the UTF-8 pointer of op is the very same pointer as its character
   data.  op must not be a compact ASCII string (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very same pointer as its character
   data.  The comparison uses the macro argument itself; it must not depend
   on a variable named `unicode` existing at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain (to_type) cast.  The main
   loop handles four elements per iteration; the trailing loop copies the
   remaining 0-3 elements.  The source pointers are kept const-qualified so
   the macro never strips const from the caller's buffer. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* Over-allocation factor for growing buffers.
   NOTE(review): the percentages below suggest the value is used as a
   divisor (extra space = size / OVERALLOCATE_FACTOR); the code using it is
   outside this part of the file -- confirm. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif

Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by an ASCII code (0-127): the entry is 1 for the
   whitespace characters named in the comments below and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks.
   Lookup table indexed by an ASCII code (0-127): the entry is 1 for the
   line-breaking characters named in the comments below and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the codec error handlers that get_error_handler()
   recognises by name.  _Py_ERROR_UNKNOWN is 0; _Py_ERROR_OTHER stands for
   any name that is not one of the handlers listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors may be NULL, which is treated like "strict".  The comparison is
   an exact, case-sensitive strcmp(); any name that is not recognised
   yields _Py_ERROR_OTHER.  _Py_ERROR_UNKNOWN is never returned. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        return _Py_ERROR_STRICT;
    }
    if (strcmp(errors, "surrogateescape") == 0) {
        return _Py_ERROR_SURROGATEESCAPE;
    }
    if (strcmp(errors, "replace") == 0) {
        return _Py_ERROR_REPLACE;
    }
    if (strcmp(errors, "ignore") == 0) {
        return _Py_ERROR_IGNORE;
    }
    if (strcmp(errors, "backslashreplace") == 0) {
        return _Py_ERROR_BACKSLASHREPLACE;
    }
    if (strcmp(errors, "surrogatepass") == 0) {
        return _Py_ERROR_SURROGATEPASS;
    }
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
        return _Py_ERROR_XMLCHARREFREPLACE;
    }
    return _Py_ERROR_OTHER;
}

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-only validator for the internal layout of a str object.

   op            - object to inspect; must be a str instance (asserted).
   check_content - if non-zero (and the string is not in the legacy wchar_t
                   state) also scan every character to verify that the
                   narrowest possible kind is used and that the buffer is
                   terminated by a null character.

   All checks are assert()s, so a violation aborts the process.  When every
   check passes the function returns 1, which allows callers to wrap the
   call itself in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the base header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as its own UTF-8 form */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (its value modulo BLOOM_WIDTH) as the
   bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
/* BLOOM(mask, ch): non-zero if the bit selected by the low-order bits of ch
   (ch modulo BLOOM_WIDTH) is set in mask.  A zero result means ch was
   definitely not added to the mask; a non-zero result only means it may
   have been.

   Both arguments are parenthesized so that an expression such as
   BLOOM(a | b, ch) tests the combined mask instead of being re-associated
   by operator precedence. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* BLOOM_LINEBREAK(ch): characters below 128 are looked up directly in
   ascii_linebreak[]; for the others the cheap bloom test on bloom_linebreak
   runs first and Py_UNICODE_ISLINEBREAK() is only called when it passes.
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters at indices
   [old_length, length) of a Unicode string with 0xff bytes, so that bugs
   involving characters that were never written show up earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the current length is not greater than
   old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value doubles as the size of one character in bytes */
        Py_UCS1 *first_new = bytes + old_length * char_size;
        memset(first_new, 0xff, (new_length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Out-of-line wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
1171
/* Out-of-line wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);

    return payload;
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data of unicode.  A high surrogate immediately followed by
   a low surrogate is joined into one code point; every other unit is copied
   as is.  The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the announced length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* a well-formed pair becomes a single code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Victor Stinnerc0e77362017-09-12 16:09:44 -07001782 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001783
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 default:
1785 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001786 }
1787
Victor Stinner03490912011-10-03 23:45:12 +02001788 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001790 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001791 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001792 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1793 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001795 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796}
1797
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singleton
   strings (the empty string, or the cached one-character string for a code
   point below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length 1 with a canonical representation can be a
       cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1814
Alexander Belopolsky40018472011-02-26 01:02:56 +00001815static int
Victor Stinner488fa492011-12-12 00:01:39 +01001816unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001817{
Victor Stinner488fa492011-12-12 00:01:39 +01001818 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001819 if (Py_REFCNT(unicode) != 1)
1820 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001821 if (_PyUnicode_HASH(unicode) != -1)
1822 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001823 if (PyUnicode_CHECK_INTERNED(unicode))
1824 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001825 if (!PyUnicode_CheckExact(unicode))
1826 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001827#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001828 /* singleton refcount is greater than 1 */
1829 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001830#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001831 return 1;
1832}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001833
Victor Stinnerfe226c02011-10-03 03:52:20 +02001834static int
1835unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1836{
1837 PyObject *unicode;
1838 Py_ssize_t old_length;
1839
1840 assert(p_unicode != NULL);
1841 unicode = *p_unicode;
1842
1843 assert(unicode != NULL);
1844 assert(PyUnicode_Check(unicode));
1845 assert(0 <= length);
1846
Victor Stinner910337b2011-10-03 03:20:16 +02001847 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848 old_length = PyUnicode_WSTR_LENGTH(unicode);
1849 else
1850 old_length = PyUnicode_GET_LENGTH(unicode);
1851 if (old_length == length)
1852 return 0;
1853
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001854 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001855 _Py_INCREF_UNICODE_EMPTY();
1856 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001858 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001859 return 0;
1860 }
1861
Victor Stinner488fa492011-12-12 00:01:39 +01001862 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 PyObject *copy = resize_copy(unicode, length);
1864 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001866 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001868 }
1869
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001871 PyObject *new_unicode = resize_compact(unicode, length);
1872 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001874 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001876 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001877 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001878}
1879
Alexander Belopolsky40018472011-02-26 01:02:56 +00001880int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001881PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001882{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001883 PyObject *unicode;
1884 if (p_unicode == NULL) {
1885 PyErr_BadInternalCall();
1886 return -1;
1887 }
1888 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001889 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001890 {
1891 PyErr_BadInternalCall();
1892 return -1;
1893 }
1894 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001895}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001896
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001897/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001898
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001899 WARNING: The function doesn't copy the terminating null character and
1900 doesn't check the maximum character (may write a latin1 character in an
1901 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001902static void
1903unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1904 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001905{
1906 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1907 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001908 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001909
1910 switch (kind) {
1911 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001912 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001913#ifdef Py_DEBUG
1914 if (PyUnicode_IS_ASCII(unicode)) {
1915 Py_UCS4 maxchar = ucs1lib_find_max_char(
1916 (const Py_UCS1*)str,
1917 (const Py_UCS1*)str + len);
1918 assert(maxchar < 128);
1919 }
1920#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001921 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923 }
1924 case PyUnicode_2BYTE_KIND: {
1925 Py_UCS2 *start = (Py_UCS2 *)data + index;
1926 Py_UCS2 *ucs2 = start;
1927 assert(index <= PyUnicode_GET_LENGTH(unicode));
1928
Victor Stinner184252a2012-06-16 02:57:41 +02001929 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001930 *ucs2 = (Py_UCS2)*str;
1931
1932 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001933 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001934 }
1935 default: {
1936 Py_UCS4 *start = (Py_UCS4 *)data + index;
1937 Py_UCS4 *ucs4 = start;
1938 assert(kind == PyUnicode_4BYTE_KIND);
1939 assert(index <= PyUnicode_GET_LENGTH(unicode));
1940
Victor Stinner184252a2012-06-16 02:57:41 +02001941 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001942 *ucs4 = (Py_UCS4)*str;
1943
1944 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001945 }
1946 }
1947}
1948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949static PyObject*
1950get_latin1_char(unsigned char ch)
1951{
Victor Stinnera464fc12011-10-02 20:39:30 +02001952 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001954 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 if (!unicode)
1956 return NULL;
1957 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001958 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 unicode_latin1[ch] = unicode;
1960 }
1961 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001962 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963}
1964
Victor Stinner985a82a2014-01-03 12:53:47 +01001965static PyObject*
1966unicode_char(Py_UCS4 ch)
1967{
1968 PyObject *unicode;
1969
1970 assert(ch <= MAX_UNICODE);
1971
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001972 if (ch < 256)
1973 return get_latin1_char(ch);
1974
Victor Stinner985a82a2014-01-03 12:53:47 +01001975 unicode = PyUnicode_New(1, ch);
1976 if (unicode == NULL)
1977 return NULL;
1978 switch (PyUnicode_KIND(unicode)) {
1979 case PyUnicode_1BYTE_KIND:
1980 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1981 break;
1982 case PyUnicode_2BYTE_KIND:
1983 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1984 break;
1985 default:
1986 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1987 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1988 }
1989 assert(_PyUnicode_CheckConsistency(unicode, 1));
1990 return unicode;
1991}
1992
Alexander Belopolsky40018472011-02-26 01:02:56 +00001993PyObject *
1994PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001996 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 Py_UCS4 maxchar = 0;
1998 Py_ssize_t num_surrogates;
1999
2000 if (u == NULL)
2001 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002003 /* If the Unicode data is known at construction time, we can apply
2004 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002007 if (size == 0)
2008 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 /* Single character Unicode objects in the Latin-1 range are
2011 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002012 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return get_latin1_char((unsigned char)*u);
2014
2015 /* If not empty and not single character, copy the Unicode data
2016 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002017 if (find_maxchar_surrogates(u, u + size,
2018 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 return NULL;
2020
Victor Stinner8faf8212011-12-08 22:14:11 +01002021 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 if (!unicode)
2023 return NULL;
2024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 switch (PyUnicode_KIND(unicode)) {
2026 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002027 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2029 break;
2030 case PyUnicode_2BYTE_KIND:
2031#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002032 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002034 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2036#endif
2037 break;
2038 case PyUnicode_4BYTE_KIND:
2039#if SIZEOF_WCHAR_T == 2
2040 /* This is the only case which has to process surrogates, thus
2041 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002042 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043#else
2044 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002045 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046#endif
2047 break;
2048 default:
2049 assert(0 && "Impossible state");
2050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002052 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053}
2054
Alexander Belopolsky40018472011-02-26 01:02:56 +00002055PyObject *
2056PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002057{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 if (size < 0) {
2059 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002060 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002061 return NULL;
2062 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002063 if (u != NULL)
2064 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2065 else
2066 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002067}
2068
Alexander Belopolsky40018472011-02-26 01:02:56 +00002069PyObject *
2070PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002071{
2072 size_t size = strlen(u);
2073 if (size > PY_SSIZE_T_MAX) {
2074 PyErr_SetString(PyExc_OverflowError, "input too long");
2075 return NULL;
2076 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002077 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002078}
2079
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002080PyObject *
2081_PyUnicode_FromId(_Py_Identifier *id)
2082{
2083 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002084 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2085 strlen(id->string),
2086 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002087 if (!id->object)
2088 return NULL;
2089 PyUnicode_InternInPlace(&id->object);
2090 assert(!id->next);
2091 id->next = static_strings;
2092 static_strings = id;
2093 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002094 return id->object;
2095}
2096
2097void
2098_PyUnicode_ClearStaticStrings()
2099{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002100 _Py_Identifier *tmp, *s = static_strings;
2101 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002102 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002103 tmp = s->next;
2104 s->next = NULL;
2105 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002106 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002107 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002108}
2109
Benjamin Peterson0df54292012-03-26 14:50:32 -04002110/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002111
Victor Stinnerd3f08822012-05-29 12:57:52 +02002112PyObject*
2113_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002114{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002115 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002116 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002117 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002118#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002119 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002120#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002121 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002122 }
Victor Stinner785938e2011-12-11 20:09:03 +01002123 unicode = PyUnicode_New(size, 127);
2124 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002125 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002126 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2127 assert(_PyUnicode_CheckConsistency(unicode, 1));
2128 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002129}
2130
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002131static Py_UCS4
2132kind_maxchar_limit(unsigned int kind)
2133{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002134 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002135 case PyUnicode_1BYTE_KIND:
2136 return 0x80;
2137 case PyUnicode_2BYTE_KIND:
2138 return 0x100;
2139 case PyUnicode_4BYTE_KIND:
2140 return 0x10000;
2141 default:
2142 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002143 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002144 }
2145}
2146
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002147static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002148align_maxchar(Py_UCS4 maxchar)
2149{
2150 if (maxchar <= 127)
2151 return 127;
2152 else if (maxchar <= 255)
2153 return 255;
2154 else if (maxchar <= 65535)
2155 return 65535;
2156 else
2157 return MAX_UNICODE;
2158}
2159
Victor Stinner702c7342011-10-05 13:50:52 +02002160static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002161_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002164 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002165
Serhiy Storchaka678db842013-01-26 12:16:36 +02002166 if (size == 0)
2167 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002168 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002169 if (size == 1)
2170 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002171
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002172 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002173 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 if (!res)
2175 return NULL;
2176 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002177 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002179}
2180
Victor Stinnere57b1c02011-09-28 22:20:48 +02002181static PyObject*
2182_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183{
2184 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002185 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002186
Serhiy Storchaka678db842013-01-26 12:16:36 +02002187 if (size == 0)
2188 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002189 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002190 if (size == 1)
2191 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002192
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002193 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002194 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 if (!res)
2196 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002197 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002199 else {
2200 _PyUnicode_CONVERT_BYTES(
2201 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2202 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002203 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 return res;
2205}
2206
Victor Stinnere57b1c02011-09-28 22:20:48 +02002207static PyObject*
2208_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209{
2210 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002212
Serhiy Storchaka678db842013-01-26 12:16:36 +02002213 if (size == 0)
2214 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002215 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002216 if (size == 1)
2217 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002218
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002219 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002220 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 if (!res)
2222 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002223 if (max_char < 256)
2224 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2225 PyUnicode_1BYTE_DATA(res));
2226 else if (max_char < 0x10000)
2227 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2228 PyUnicode_2BYTE_DATA(res));
2229 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002231 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 return res;
2233}
2234
2235PyObject*
2236PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2237{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002238 if (size < 0) {
2239 PyErr_SetString(PyExc_ValueError, "size must be positive");
2240 return NULL;
2241 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002242 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002244 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002246 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002248 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002250 PyErr_SetString(PyExc_SystemError, "invalid kind");
2251 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253}
2254
Victor Stinnerece58de2012-04-23 23:36:38 +02002255Py_UCS4
2256_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2257{
2258 enum PyUnicode_Kind kind;
2259 void *startptr, *endptr;
2260
2261 assert(PyUnicode_IS_READY(unicode));
2262 assert(0 <= start);
2263 assert(end <= PyUnicode_GET_LENGTH(unicode));
2264 assert(start <= end);
2265
2266 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2267 return PyUnicode_MAX_CHAR_VALUE(unicode);
2268
2269 if (start == end)
2270 return 127;
2271
Victor Stinner94d558b2012-04-27 22:26:58 +02002272 if (PyUnicode_IS_ASCII(unicode))
2273 return 127;
2274
Victor Stinnerece58de2012-04-23 23:36:38 +02002275 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002276 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002277 endptr = (char *)startptr + end * kind;
2278 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002279 switch(kind) {
2280 case PyUnicode_1BYTE_KIND:
2281 return ucs1lib_find_max_char(startptr, endptr);
2282 case PyUnicode_2BYTE_KIND:
2283 return ucs2lib_find_max_char(startptr, endptr);
2284 case PyUnicode_4BYTE_KIND:
2285 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002286 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002287 assert(0);
2288 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002289 }
2290}
2291
Victor Stinner25a4b292011-10-06 12:31:55 +02002292/* Ensure that a string uses the most efficient storage, if it is not the
2293 case: create a new string with of the right kind. Write NULL into *p_unicode
2294 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002295static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002296unicode_adjust_maxchar(PyObject **p_unicode)
2297{
2298 PyObject *unicode, *copy;
2299 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002300 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002301 unsigned int kind;
2302
2303 assert(p_unicode != NULL);
2304 unicode = *p_unicode;
2305 assert(PyUnicode_IS_READY(unicode));
2306 if (PyUnicode_IS_ASCII(unicode))
2307 return;
2308
2309 len = PyUnicode_GET_LENGTH(unicode);
2310 kind = PyUnicode_KIND(unicode);
2311 if (kind == PyUnicode_1BYTE_KIND) {
2312 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs1lib_find_max_char(u, u + len);
2314 if (max_char >= 128)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else if (kind == PyUnicode_2BYTE_KIND) {
2318 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 max_char = ucs2lib_find_max_char(u, u + len);
2320 if (max_char >= 256)
2321 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002322 }
2323 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002324 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002325 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002326 max_char = ucs4lib_find_max_char(u, u + len);
2327 if (max_char >= 0x10000)
2328 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002330 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002331 if (copy != NULL)
2332 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002333 Py_DECREF(unicode);
2334 *p_unicode = copy;
2335}
2336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002338_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002339{
Victor Stinner87af4f22011-11-21 23:03:47 +01002340 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002342
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343 if (!PyUnicode_Check(unicode)) {
2344 PyErr_BadInternalCall();
2345 return NULL;
2346 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002347 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002348 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 length = PyUnicode_GET_LENGTH(unicode);
2351 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002352 if (!copy)
2353 return NULL;
2354 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2355
Christian Heimesf051e432016-09-13 20:22:02 +02002356 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002357 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002358 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002359 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002360}
2361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363/* Widen Unicode objects to larger buffers. Don't write terminating null
2364 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365
2366void*
2367_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2368{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369 Py_ssize_t len;
2370 void *result;
2371 unsigned int skind;
2372
Benjamin Petersonbac79492012-01-14 13:34:47 -05002373 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002374 return NULL;
2375
2376 len = PyUnicode_GET_LENGTH(s);
2377 skind = PyUnicode_KIND(s);
2378 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002379 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return NULL;
2381 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002382 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002383 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002384 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002385 if (!result)
2386 return PyErr_NoMemory();
2387 assert(skind == PyUnicode_1BYTE_KIND);
2388 _PyUnicode_CONVERT_BYTES(
2389 Py_UCS1, Py_UCS2,
2390 PyUnicode_1BYTE_DATA(s),
2391 PyUnicode_1BYTE_DATA(s) + len,
2392 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002395 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002396 if (!result)
2397 return PyErr_NoMemory();
2398 if (skind == PyUnicode_2BYTE_KIND) {
2399 _PyUnicode_CONVERT_BYTES(
2400 Py_UCS2, Py_UCS4,
2401 PyUnicode_2BYTE_DATA(s),
2402 PyUnicode_2BYTE_DATA(s) + len,
2403 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 else {
2406 assert(skind == PyUnicode_1BYTE_KIND);
2407 _PyUnicode_CONVERT_BYTES(
2408 Py_UCS1, Py_UCS4,
2409 PyUnicode_1BYTE_DATA(s),
2410 PyUnicode_1BYTE_DATA(s) + len,
2411 result);
2412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 default:
2415 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 }
Victor Stinner01698042011-10-04 00:04:26 +02002417 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 return NULL;
2419}
2420
2421static Py_UCS4*
2422as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2423 int copy_null)
2424{
2425 int kind;
2426 void *data;
2427 Py_ssize_t len, targetlen;
2428 if (PyUnicode_READY(string) == -1)
2429 return NULL;
2430 kind = PyUnicode_KIND(string);
2431 data = PyUnicode_DATA(string);
2432 len = PyUnicode_GET_LENGTH(string);
2433 targetlen = len;
2434 if (copy_null)
2435 targetlen++;
2436 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002437 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 if (!target) {
2439 PyErr_NoMemory();
2440 return NULL;
2441 }
2442 }
2443 else {
2444 if (targetsize < targetlen) {
2445 PyErr_Format(PyExc_SystemError,
2446 "string is longer than the buffer");
2447 if (copy_null && 0 < targetsize)
2448 target[0] = 0;
2449 return NULL;
2450 }
2451 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002452 if (kind == PyUnicode_1BYTE_KIND) {
2453 Py_UCS1 *start = (Py_UCS1 *) data;
2454 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002456 else if (kind == PyUnicode_2BYTE_KIND) {
2457 Py_UCS2 *start = (Py_UCS2 *) data;
2458 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2459 }
2460 else {
2461 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002462 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (copy_null)
2465 target[len] = 0;
2466 return target;
2467}
2468
2469Py_UCS4*
2470PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2471 int copy_null)
2472{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002473 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 PyErr_BadInternalCall();
2475 return NULL;
2476 }
2477 return as_ucs4(string, target, targetsize, copy_null);
2478}
2479
2480Py_UCS4*
2481PyUnicode_AsUCS4Copy(PyObject *string)
2482{
2483 return as_ucs4(string, NULL, 0, 1);
2484}
2485
2486#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002487
Alexander Belopolsky40018472011-02-26 01:02:56 +00002488PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002489PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002493 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 PyErr_BadInternalCall();
2495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
2497
Martin v. Löwis790465f2008-04-05 20:41:37 +00002498 if (size == -1) {
2499 size = wcslen(w);
2500 }
2501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503}
2504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002506
/* Size of the scratch buffers used to format %lld or %p output.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002511
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002512static int
2513unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2514 Py_ssize_t width, Py_ssize_t precision)
2515{
2516 Py_ssize_t length, fill, arglen;
2517 Py_UCS4 maxchar;
2518
2519 if (PyUnicode_READY(str) == -1)
2520 return -1;
2521
2522 length = PyUnicode_GET_LENGTH(str);
2523 if ((precision == -1 || precision >= length)
2524 && width <= length)
2525 return _PyUnicodeWriter_WriteStr(writer, str);
2526
2527 if (precision != -1)
2528 length = Py_MIN(precision, length);
2529
2530 arglen = Py_MAX(length, width);
2531 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2532 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2533 else
2534 maxchar = writer->maxchar;
2535
2536 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2537 return -1;
2538
2539 if (width > length) {
2540 fill = width - length;
2541 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2542 return -1;
2543 writer->pos += fill;
2544 }
2545
2546 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2547 str, 0, length);
2548 writer->pos += length;
2549 return 0;
2550}
2551
2552static int
2553unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2554 Py_ssize_t width, Py_ssize_t precision)
2555{
2556 /* UTF-8 */
2557 Py_ssize_t length;
2558 PyObject *unicode;
2559 int res;
2560
2561 length = strlen(str);
2562 if (precision != -1)
2563 length = Py_MIN(length, precision);
2564 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2565 if (unicode == NULL)
2566 return -1;
2567
2568 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2569 Py_DECREF(unicode);
2570 return res;
2571}
2572
Victor Stinner96865452011-03-01 23:44:09 +00002573static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002574unicode_fromformat_arg(_PyUnicodeWriter *writer,
2575 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002576{
Victor Stinnere215d962012-10-06 23:03:36 +02002577 const char *p;
2578 Py_ssize_t len;
2579 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 Py_ssize_t width;
2581 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002582 int longflag;
2583 int longlongflag;
2584 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002586
2587 p = f;
2588 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002589 zeropad = 0;
2590 if (*f == '0') {
2591 zeropad = 1;
2592 f++;
2593 }
Victor Stinner96865452011-03-01 23:44:09 +00002594
2595 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 width = -1;
2597 if (Py_ISDIGIT((unsigned)*f)) {
2598 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002599 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002600 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002602 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002603 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002604 return NULL;
2605 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002607 f++;
2608 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002609 }
2610 precision = -1;
2611 if (*f == '.') {
2612 f++;
2613 if (Py_ISDIGIT((unsigned)*f)) {
2614 precision = (*f - '0');
2615 f++;
2616 while (Py_ISDIGIT((unsigned)*f)) {
2617 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2618 PyErr_SetString(PyExc_ValueError,
2619 "precision too big");
2620 return NULL;
2621 }
2622 precision = (precision * 10) + (*f - '0');
2623 f++;
2624 }
2625 }
Victor Stinner96865452011-03-01 23:44:09 +00002626 if (*f == '%') {
2627 /* "%.3%s" => f points to "3" */
2628 f--;
2629 }
2630 }
2631 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002632 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002633 f--;
2634 }
Victor Stinner96865452011-03-01 23:44:09 +00002635
2636 /* Handle %ld, %lu, %lld and %llu. */
2637 longflag = 0;
2638 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002639 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002640 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longflag = 1;
2643 ++f;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002646 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002647 longlongflag = 1;
2648 f += 2;
2649 }
Victor Stinner96865452011-03-01 23:44:09 +00002650 }
2651 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002652 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002653 size_tflag = 1;
2654 ++f;
2655 }
Victor Stinnere215d962012-10-06 23:03:36 +02002656
2657 if (f[1] == '\0')
2658 writer->overallocate = 0;
2659
2660 switch (*f) {
2661 case 'c':
2662 {
2663 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002664 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002665 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002666 "character argument not in range(0x110000)");
2667 return NULL;
2668 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002669 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002670 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002671 break;
2672 }
2673
2674 case 'i':
2675 case 'd':
2676 case 'u':
2677 case 'x':
2678 {
2679 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002680 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002681 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002682
2683 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002684 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002687 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002688 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002689 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002690 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002691 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002692 va_arg(*vargs, size_t));
2693 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002694 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002695 va_arg(*vargs, unsigned int));
2696 }
2697 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002698 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 }
2700 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002701 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002704 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002705 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002706 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002707 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002709 va_arg(*vargs, Py_ssize_t));
2710 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, int));
2713 }
2714 assert(len >= 0);
2715
Victor Stinnere215d962012-10-06 23:03:36 +02002716 if (precision < len)
2717 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002718
2719 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2721 return NULL;
2722
Victor Stinnere215d962012-10-06 23:03:36 +02002723 if (width > precision) {
2724 Py_UCS4 fillchar;
2725 fill = width - precision;
2726 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002727 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2728 return NULL;
2729 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002730 }
Victor Stinner15a11362012-10-06 23:48:20 +02002731 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002732 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002733 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2734 return NULL;
2735 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002736 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002737
Victor Stinner4a587072013-11-19 12:54:53 +01002738 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2739 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 break;
2741 }
2742
2743 case 'p':
2744 {
2745 char number[MAX_LONG_LONG_CHARS];
2746
2747 len = sprintf(number, "%p", va_arg(*vargs, void*));
2748 assert(len >= 0);
2749
2750 /* %p is ill-defined: ensure leading 0x. */
2751 if (number[1] == 'X')
2752 number[1] = 'x';
2753 else if (number[1] != 'x') {
2754 memmove(number + 2, number,
2755 strlen(number) + 1);
2756 number[0] = '0';
2757 number[1] = 'x';
2758 len += 2;
2759 }
2760
Victor Stinner4a587072013-11-19 12:54:53 +01002761 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002762 return NULL;
2763 break;
2764 }
2765
2766 case 's':
2767 {
2768 /* UTF-8 */
2769 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002770 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002771 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002772 break;
2773 }
2774
2775 case 'U':
2776 {
2777 PyObject *obj = va_arg(*vargs, PyObject *);
2778 assert(obj && _PyUnicode_CHECK(obj));
2779
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
2782 break;
2783 }
2784
2785 case 'V':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002789 if (obj) {
2790 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
2793 }
2794 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 assert(str != NULL);
2796 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 }
2799 break;
2800 }
2801
2802 case 'S':
2803 {
2804 PyObject *obj = va_arg(*vargs, PyObject *);
2805 PyObject *str;
2806 assert(obj);
2807 str = PyObject_Str(obj);
2808 if (!str)
2809 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002811 Py_DECREF(str);
2812 return NULL;
2813 }
2814 Py_DECREF(str);
2815 break;
2816 }
2817
2818 case 'R':
2819 {
2820 PyObject *obj = va_arg(*vargs, PyObject *);
2821 PyObject *repr;
2822 assert(obj);
2823 repr = PyObject_Repr(obj);
2824 if (!repr)
2825 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002827 Py_DECREF(repr);
2828 return NULL;
2829 }
2830 Py_DECREF(repr);
2831 break;
2832 }
2833
2834 case 'A':
2835 {
2836 PyObject *obj = va_arg(*vargs, PyObject *);
2837 PyObject *ascii;
2838 assert(obj);
2839 ascii = PyObject_ASCII(obj);
2840 if (!ascii)
2841 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002842 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002843 Py_DECREF(ascii);
2844 return NULL;
2845 }
2846 Py_DECREF(ascii);
2847 break;
2848 }
2849
2850 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002851 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002852 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 break;
2854
2855 default:
2856 /* if we stumble upon an unknown formatting code, copy the rest
2857 of the format string to the output string. (we cannot just
2858 skip the code, since there's no way to know what's in the
2859 argument list) */
2860 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
2863 f = p+len;
2864 return f;
2865 }
2866
2867 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002868 return f;
2869}
2870
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871PyObject *
2872PyUnicode_FromFormatV(const char *format, va_list vargs)
2873{
Victor Stinnere215d962012-10-06 23:03:36 +02002874 va_list vargs2;
2875 const char *f;
2876 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877
Victor Stinner8f674cc2013-04-17 23:02:17 +02002878 _PyUnicodeWriter_Init(&writer);
2879 writer.min_length = strlen(format) + 100;
2880 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002881
Benjamin Peterson0c212142016-09-20 20:39:33 -07002882 // Copy varags to be able to pass a reference to a subfunction.
2883 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002884
2885 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002886 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002887 f = unicode_fromformat_arg(&writer, f, &vargs2);
2888 if (f == NULL)
2889 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002891 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002892 const char *p;
2893 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002894
Victor Stinnere215d962012-10-06 23:03:36 +02002895 p = f;
2896 do
2897 {
2898 if ((unsigned char)*p > 127) {
2899 PyErr_Format(PyExc_ValueError,
2900 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2901 "string, got a non-ASCII byte: 0x%02x",
2902 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002903 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002904 }
2905 p++;
2906 }
2907 while (*p != '\0' && *p != '%');
2908 len = p - f;
2909
2910 if (*p == '\0')
2911 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002912
2913 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002914 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002915
2916 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002919 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return _PyUnicodeWriter_Finish(&writer);
2921
2922 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002923 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002924 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926}
2927
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928PyObject *
2929PyUnicode_FromFormat(const char *format, ...)
2930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 PyObject* ret;
2932 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002933
2934#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 ret = PyUnicode_FromFormatV(format, vargs);
2940 va_end(vargs);
2941 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002942}
2943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944#ifdef HAVE_WCHAR_H
2945
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2947 convert a Unicode object to a wide character string.
2948
Victor Stinnerd88d9832011-09-06 02:00:05 +02002949 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002950 character) required to convert the unicode object. Ignore size argument.
2951
Victor Stinnerd88d9832011-09-06 02:00:05 +02002952 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002953 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002954 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002955static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002957 wchar_t *w,
2958 Py_ssize_t size)
2959{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 const wchar_t *wstr;
2962
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002963 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 if (wstr == NULL)
2965 return -1;
2966
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002968 if (size > res)
2969 size = res + 1;
2970 else
2971 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002972 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002973 return res;
2974 }
2975 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002977}
2978
2979Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002980PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002981 wchar_t *w,
2982 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983{
2984 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 PyErr_BadInternalCall();
2986 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002988 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989}
2990
Victor Stinner137c34c2010-09-29 10:25:54 +00002991wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002992PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002993 Py_ssize_t *size)
2994{
2995 wchar_t* buffer;
2996 Py_ssize_t buflen;
2997
2998 if (unicode == NULL) {
2999 PyErr_BadInternalCall();
3000 return NULL;
3001 }
3002
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003003 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003004 if (buflen == -1)
3005 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003006 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003007 if (buffer == NULL) {
3008 PyErr_NoMemory();
3009 return NULL;
3010 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003011 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003012 if (buflen == -1) {
3013 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003015 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003016 if (size != NULL)
3017 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003018 return buffer;
3019}
3020
Serhiy Storchaka0edffa32017-06-27 21:08:58 +03003021wchar_t*
3022_PyUnicode_AsWideCharString(PyObject *unicode)
3023{
3024 const wchar_t *wstr;
3025 wchar_t *buffer;
3026 Py_ssize_t buflen;
3027
3028 if (unicode == NULL) {
3029 PyErr_BadInternalCall();
3030 return NULL;
3031 }
3032
3033 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3034 if (wstr == NULL) {
3035 return NULL;
3036 }
3037 if (wcslen(wstr) != (size_t)buflen) {
3038 PyErr_SetString(PyExc_ValueError,
3039 "embedded null character");
3040 return NULL;
3041 }
3042
3043 buffer = PyMem_NEW(wchar_t, buflen + 1);
3044 if (buffer == NULL) {
3045 PyErr_NoMemory();
3046 return NULL;
3047 }
3048 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3049 return buffer;
3050}
3051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003052#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053
Alexander Belopolsky40018472011-02-26 01:02:56 +00003054PyObject *
3055PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003056{
Victor Stinner8faf8212011-12-08 22:14:11 +01003057 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 PyErr_SetString(PyExc_ValueError,
3059 "chr() arg not in range(0x110000)");
3060 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003061 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003062
Victor Stinner985a82a2014-01-03 12:53:47 +01003063 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003064}
3065
Alexander Belopolsky40018472011-02-26 01:02:56 +00003066PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003067PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003069 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003071 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003072 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003073 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 Py_INCREF(obj);
3075 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003076 }
3077 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 /* For a Unicode subtype that's not a Unicode object,
3079 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003080 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003081 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003082 PyErr_Format(PyExc_TypeError,
3083 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003084 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003085 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003086}
3087
Alexander Belopolsky40018472011-02-26 01:02:56 +00003088PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003089PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003090 const char *encoding,
3091 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003092{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003094 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003095
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 PyErr_BadInternalCall();
3098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003101 /* Decoding bytes objects is the most common case and should be fast */
3102 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003103 if (PyBytes_GET_SIZE(obj) == 0)
3104 _Py_RETURN_UNICODE_EMPTY();
3105 v = PyUnicode_Decode(
3106 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3107 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003108 return v;
3109 }
3110
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003111 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 PyErr_SetString(PyExc_TypeError,
3113 "decoding str is not supported");
3114 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003115 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003116
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003117 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3118 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3119 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003120 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003121 Py_TYPE(obj)->tp_name);
3122 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003123 }
Tim Petersced69f82003-09-16 20:30:58 +00003124
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003125 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003126 PyBuffer_Release(&buffer);
3127 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003129
Serhiy Storchaka05997252013-01-26 12:14:02 +02003130 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003131 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003132 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133}
3134
/* Normalize an encoding name into `lower`: similar to
   encodings.normalize_encoding(), but the result is also lowercased.

   Alphanumeric characters and '.' are copied (lowercased).  Every run of
   other characters between two copied characters becomes a single '_';
   such runs at the very start or end of the name are dropped.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters (the last slot is kept for the null character). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *dst_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char ch = *src;

        if (!Py_ISALNUM(ch) && ch != '.') {
            /* Remember the separator; it is emitted lazily, only if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(ch);
    }

    *dst = '\0';
    return 1;
}
3183
Alexander Belopolsky40018472011-02-26 01:02:56 +00003184PyObject *
3185PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003186 Py_ssize_t size,
3187 const char *encoding,
3188 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003189{
3190 PyObject *buffer = NULL, *unicode;
3191 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003192 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3193
3194 if (encoding == NULL) {
3195 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3196 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003197
Fred Drakee4315f52000-05-09 19:53:39 +00003198 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003199 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3200 char *lower = buflower;
3201
3202 /* Fast paths */
3203 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3204 lower += 3;
3205 if (*lower == '_') {
3206 /* Match "utf8" and "utf_8" */
3207 lower++;
3208 }
3209
3210 if (lower[0] == '8' && lower[1] == 0) {
3211 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3212 }
3213 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3214 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3215 }
3216 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3217 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3218 }
3219 }
3220 else {
3221 if (strcmp(lower, "ascii") == 0
3222 || strcmp(lower, "us_ascii") == 0) {
3223 return PyUnicode_DecodeASCII(s, size, errors);
3224 }
Steve Dowercc16be82016-09-08 10:35:16 -07003225 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003226 else if (strcmp(lower, "mbcs") == 0) {
3227 return PyUnicode_DecodeMBCS(s, size, errors);
3228 }
3229 #endif
3230 else if (strcmp(lower, "latin1") == 0
3231 || strcmp(lower, "latin_1") == 0
3232 || strcmp(lower, "iso_8859_1") == 0
3233 || strcmp(lower, "iso8859_1") == 0) {
3234 return PyUnicode_DecodeLatin1(s, size, errors);
3235 }
3236 }
Victor Stinner37296e82010-06-10 13:36:23 +00003237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238
3239 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003240 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003241 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003242 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003243 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 if (buffer == NULL)
3245 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003246 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 if (unicode == NULL)
3248 goto onError;
3249 if (!PyUnicode_Check(unicode)) {
3250 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003251 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3252 "use codecs.decode() to decode to arbitrary types",
3253 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003254 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 Py_DECREF(unicode);
3256 goto onError;
3257 }
3258 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003259 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003260
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_XDECREF(buffer);
3263 return NULL;
3264}
3265
Alexander Belopolsky40018472011-02-26 01:02:56 +00003266PyObject *
3267PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003268 const char *encoding,
3269 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003270{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003271 if (!PyUnicode_Check(unicode)) {
3272 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003273 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003274 }
3275
Serhiy Storchaka00939072016-10-27 21:05:49 +03003276 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3277 "PyUnicode_AsDecodedObject() is deprecated; "
3278 "use PyCodec_Decode() to decode from str", 1) < 0)
3279 return NULL;
3280
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003281 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003283
3284 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003285 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003286}
3287
Alexander Belopolsky40018472011-02-26 01:02:56 +00003288PyObject *
3289PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003290 const char *encoding,
3291 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292{
3293 PyObject *v;
3294
3295 if (!PyUnicode_Check(unicode)) {
3296 PyErr_BadArgument();
3297 goto onError;
3298 }
3299
Serhiy Storchaka00939072016-10-27 21:05:49 +03003300 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3301 "PyUnicode_AsDecodedUnicode() is deprecated; "
3302 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3303 return NULL;
3304
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003305 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003307
3308 /* Decode via the codec registry */
3309 v = PyCodec_Decode(unicode, encoding, errors);
3310 if (v == NULL)
3311 goto onError;
3312 if (!PyUnicode_Check(v)) {
3313 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003314 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3315 "use codecs.decode() to decode to arbitrary types",
3316 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003317 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003318 Py_DECREF(v);
3319 goto onError;
3320 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003321 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003322
Benjamin Peterson29060642009-01-31 22:14:21 +00003323 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003324 return NULL;
3325}
3326
Alexander Belopolsky40018472011-02-26 01:02:56 +00003327PyObject *
3328PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003329 Py_ssize_t size,
3330 const char *encoding,
3331 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332{
3333 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003334
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 unicode = PyUnicode_FromUnicode(s, size);
3336 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3339 Py_DECREF(unicode);
3340 return v;
3341}
3342
Alexander Belopolsky40018472011-02-26 01:02:56 +00003343PyObject *
3344PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003345 const char *encoding,
3346 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003347{
3348 PyObject *v;
3349
3350 if (!PyUnicode_Check(unicode)) {
3351 PyErr_BadArgument();
3352 goto onError;
3353 }
3354
Serhiy Storchaka00939072016-10-27 21:05:49 +03003355 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3356 "PyUnicode_AsEncodedObject() is deprecated; "
3357 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3358 "or PyCodec_Encode() for generic encoding", 1) < 0)
3359 return NULL;
3360
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003361 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003363
3364 /* Encode via the codec registry */
3365 v = PyCodec_Encode(unicode, encoding, errors);
3366 if (v == NULL)
3367 goto onError;
3368 return v;
3369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003371 return NULL;
3372}
3373
/* Locate the first wide character of `wstr` that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).  Returns the index in
   `wstr` where the failing unit starts, or 0 when every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Null-terminated scratch string holding the unit under test. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    size_t pos = 0;

    while (wstr[pos] != L'\0') {
        size_t unit_start = pos;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[pos])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[pos + 1]))
        {
            unit[0] = wstr[pos];
            unit[1] = wstr[pos + 1];
            pos += 2;
        }
        else {
            unit[0] = wstr[pos];
            unit[1] = 0;
            pos++;
        }
#else
        unit[0] = wstr[pos];
        pos++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return unit_start;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3420
Victor Stinner1b579672011-12-17 05:47:23 +01003421static int
3422locale_error_handler(const char *errors, int *surrogateescape)
3423{
Victor Stinner50149202015-09-22 00:26:54 +02003424 _Py_error_handler error_handler = get_error_handler(errors);
3425 switch (error_handler)
3426 {
3427 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003428 *surrogateescape = 0;
3429 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003430 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003431 *surrogateescape = 1;
3432 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003433 default:
3434 PyErr_Format(PyExc_ValueError,
3435 "only 'strict' and 'surrogateescape' error handlers "
3436 "are supported, not '%s'",
3437 errors);
3438 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003439 }
Victor Stinner1b579672011-12-17 05:47:23 +01003440}
3441
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003443PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444{
3445 Py_ssize_t wlen, wlen2;
3446 wchar_t *wstr;
3447 PyObject *bytes = NULL;
3448 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003449 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 PyObject *exc;
3451 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003452 int surrogateescape;
3453
3454 if (locale_error_handler(errors, &surrogateescape) < 0)
3455 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003456
3457 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3458 if (wstr == NULL)
3459 return NULL;
3460
3461 wlen2 = wcslen(wstr);
3462 if (wlen2 != wlen) {
3463 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003464 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003465 return NULL;
3466 }
3467
3468 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003469 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003470 char *str;
3471
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003472 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003473 if (str == NULL) {
3474 if (error_pos == (size_t)-1) {
3475 PyErr_NoMemory();
3476 PyMem_Free(wstr);
3477 return NULL;
3478 }
3479 else {
3480 goto encode_error;
3481 }
3482 }
3483 PyMem_Free(wstr);
3484
3485 bytes = PyBytes_FromString(str);
3486 PyMem_Free(str);
3487 }
3488 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003489 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003490 size_t len, len2;
3491
3492 len = wcstombs(NULL, wstr, 0);
3493 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003494 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003495 goto encode_error;
3496 }
3497
3498 bytes = PyBytes_FromStringAndSize(NULL, len);
3499 if (bytes == NULL) {
3500 PyMem_Free(wstr);
3501 return NULL;
3502 }
3503
3504 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3505 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003506 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003507 goto encode_error;
3508 }
3509 PyMem_Free(wstr);
3510 }
3511 return bytes;
3512
3513encode_error:
3514 errmsg = strerror(errno);
3515 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003516
3517 if (error_pos == (size_t)-1)
3518 error_pos = wcstombs_errorpos(wstr);
3519
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003520 PyMem_Free(wstr);
3521 Py_XDECREF(bytes);
3522
Victor Stinner2f197072011-12-17 07:08:30 +01003523 if (errmsg != NULL) {
3524 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003525 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003526 if (wstr != NULL) {
3527 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003528 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003529 } else
3530 errmsg = NULL;
3531 }
3532 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003533 reason = PyUnicode_FromString(
3534 "wcstombs() encountered an unencodable "
3535 "wide character");
3536 if (reason == NULL)
3537 return NULL;
3538
3539 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3540 "locale", unicode,
3541 (Py_ssize_t)error_pos,
3542 (Py_ssize_t)(error_pos+1),
3543 reason);
3544 Py_DECREF(reason);
3545 if (exc != NULL) {
3546 PyCodec_StrictErrors(exc);
3547 Py_XDECREF(exc);
3548 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003549 return NULL;
3550}
3551
Victor Stinnerad158722010-10-27 00:25:46 +00003552PyObject *
3553PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003554{
Steve Dowercc16be82016-09-08 10:35:16 -07003555#if defined(__APPLE__)
3556 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003557#else
Victor Stinner793b5312011-04-27 00:24:21 +02003558 PyInterpreterState *interp = PyThreadState_GET()->interp;
3559 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3560 cannot use it to encode and decode filenames before it is loaded. Load
3561 the Python codec requires to encode at least its own filename. Use the C
3562 version of the locale codec until the codec registry is initialized and
3563 the Python codec is loaded.
3564
3565 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3566 cannot only rely on it: check also interp->fscodec_initialized for
3567 subinterpreters. */
3568 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003569 return PyUnicode_AsEncodedString(unicode,
3570 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003571 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003572 }
3573 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003574 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003575 }
Victor Stinnerad158722010-10-27 00:25:46 +00003576#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003577}
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding,
3582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583{
3584 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003585 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (!PyUnicode_Check(unicode)) {
3588 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Fred Drakee4315f52000-05-09 19:53:39 +00003591
Victor Stinner942889a2016-09-05 15:40:10 -07003592 if (encoding == NULL) {
3593 return _PyUnicode_AsUTF8String(unicode, errors);
3594 }
3595
Fred Drakee4315f52000-05-09 19:53:39 +00003596 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003597 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3598 char *lower = buflower;
3599
3600 /* Fast paths */
3601 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3602 lower += 3;
3603 if (*lower == '_') {
3604 /* Match "utf8" and "utf_8" */
3605 lower++;
3606 }
3607
3608 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003610 }
3611 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3612 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3613 }
3614 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3615 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3616 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003617 }
Victor Stinner942889a2016-09-05 15:40:10 -07003618 else {
3619 if (strcmp(lower, "ascii") == 0
3620 || strcmp(lower, "us_ascii") == 0) {
3621 return _PyUnicode_AsASCIIString(unicode, errors);
3622 }
Steve Dowercc16be82016-09-08 10:35:16 -07003623#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003624 else if (strcmp(lower, "mbcs") == 0) {
3625 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3626 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003627#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003628 else if (strcmp(lower, "latin1") == 0 ||
3629 strcmp(lower, "latin_1") == 0 ||
3630 strcmp(lower, "iso_8859_1") == 0 ||
3631 strcmp(lower, "iso8859_1") == 0) {
3632 return _PyUnicode_AsLatin1String(unicode, errors);
3633 }
3634 }
Victor Stinner37296e82010-06-10 13:36:23 +00003635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003638 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 return NULL;
3641
3642 /* The normal path */
3643 if (PyBytes_Check(v))
3644 return v;
3645
3646 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003648 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003649 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650
3651 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "encoder %s returned bytearray instead of bytes; "
3653 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003654 encoding);
3655 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 Py_DECREF(v);
3657 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003660 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3661 Py_DECREF(v);
3662 return b;
3663 }
3664
3665 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003666 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3667 "use codecs.encode() to encode to arbitrary types",
3668 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003669 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003670 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003671 return NULL;
3672}
3673
Alexander Belopolsky40018472011-02-26 01:02:56 +00003674PyObject *
3675PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003676 const char *encoding,
3677 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003678{
3679 PyObject *v;
3680
3681 if (!PyUnicode_Check(unicode)) {
3682 PyErr_BadArgument();
3683 goto onError;
3684 }
3685
Serhiy Storchaka00939072016-10-27 21:05:49 +03003686 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3687 "PyUnicode_AsEncodedUnicode() is deprecated; "
3688 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3689 return NULL;
3690
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003691 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003693
3694 /* Encode via the codec registry */
3695 v = PyCodec_Encode(unicode, encoding, errors);
3696 if (v == NULL)
3697 goto onError;
3698 if (!PyUnicode_Check(v)) {
3699 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003700 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3701 "use codecs.encode() to encode to arbitrary types",
3702 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003703 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003704 Py_DECREF(v);
3705 goto onError;
3706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003708
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 return NULL;
3711}
3712
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert.

   Returns the byte offset of the sequence on which mbrtowc() reported a
   conversion error ((size_t)-1) or an incomplete character ((size_t)-2).
   Returns 0 when no such sequence is found, when a null wide character is
   decoded first, or when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;

    memset(&state, 0, sizeof state);
    while (len > 0) {
        wchar_t ch;
        size_t consumed = mbrtowc(&ch, cursor, len, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += consumed;
        len -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3743
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003744PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003746 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003747{
3748 wchar_t smallbuf[256];
3749 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3750 wchar_t *wstr;
3751 size_t wlen, wlen2;
3752 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003753 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003754 size_t error_pos;
3755 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003756 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3757 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003758
3759 if (locale_error_handler(errors, &surrogateescape) < 0)
3760 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003761
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003762 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3763 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003764 return NULL;
3765 }
3766
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003767 if (surrogateescape) {
3768 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003769 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 if (wstr == NULL) {
3771 if (wlen == (size_t)-1)
3772 PyErr_NoMemory();
3773 else
3774 PyErr_SetFromErrno(PyExc_OSError);
3775 return NULL;
3776 }
3777
3778 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003779 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003780 }
3781 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003782 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003783#ifndef HAVE_BROKEN_MBSTOWCS
3784 wlen = mbstowcs(NULL, str, 0);
3785#else
3786 wlen = len;
3787#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003788 if (wlen == (size_t)-1)
3789 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003790 if (wlen+1 <= smallbuf_len) {
3791 wstr = smallbuf;
3792 }
3793 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003794 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003795 if (!wstr)
3796 return PyErr_NoMemory();
3797 }
3798
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003799 wlen2 = mbstowcs(wstr, str, wlen+1);
3800 if (wlen2 == (size_t)-1) {
3801 if (wstr != smallbuf)
3802 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003803 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003804 }
3805#ifdef HAVE_BROKEN_MBSTOWCS
3806 assert(wlen2 == wlen);
3807#endif
3808 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3809 if (wstr != smallbuf)
3810 PyMem_Free(wstr);
3811 }
3812 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003813
3814decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003815 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003816 errmsg = strerror(errno);
3817 assert(errmsg != NULL);
3818
3819 error_pos = mbstowcs_errorpos(str, len);
3820 if (errmsg != NULL) {
3821 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003822 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003823 if (wstr != NULL) {
3824 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003825 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003826 }
Victor Stinner2f197072011-12-17 07:08:30 +01003827 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003828 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003829 reason = PyUnicode_FromString(
3830 "mbstowcs() encountered an invalid multibyte sequence");
3831 if (reason == NULL)
3832 return NULL;
3833
3834 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3835 "locale", str, len,
3836 (Py_ssize_t)error_pos,
3837 (Py_ssize_t)(error_pos+1),
3838 reason);
3839 Py_DECREF(reason);
3840 if (exc != NULL) {
3841 PyCodec_StrictErrors(exc);
3842 Py_XDECREF(exc);
3843 }
3844 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003845}
3846
3847PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003848PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003849{
3850 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003851 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003852}
3853
3854
3855PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003856PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003857 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003858 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3859}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003860
Christian Heimes5894ba72007-11-04 11:43:14 +00003861PyObject*
3862PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3863{
Steve Dowercc16be82016-09-08 10:35:16 -07003864#if defined(__APPLE__)
3865 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003866#else
Victor Stinner793b5312011-04-27 00:24:21 +02003867 PyInterpreterState *interp = PyThreadState_GET()->interp;
3868 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3869 cannot use it to encode and decode filenames before it is loaded. Load
3870 the Python codec requires to encode at least its own filename. Use the C
3871 version of the locale codec until the codec registry is initialized and
3872 the Python codec is loaded.
3873
3874 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3875 cannot only rely on it: check also interp->fscodec_initialized for
3876 subinterpreters. */
3877 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003878 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003879 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003880 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003881 }
3882 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003883 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003884 }
Victor Stinnerad158722010-10-27 00:25:46 +00003885#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003886}
3887
Martin v. Löwis011e8422009-05-05 04:43:17 +00003888
3889int
3890PyUnicode_FSConverter(PyObject* arg, void* addr)
3891{
Brett Cannonec6ce872016-09-06 15:50:29 -07003892 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003893 PyObject *output = NULL;
3894 Py_ssize_t size;
3895 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003896 if (arg == NULL) {
3897 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003898 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003899 return 1;
3900 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003901 path = PyOS_FSPath(arg);
3902 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003903 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003904 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003905 if (PyBytes_Check(path)) {
3906 output = path;
3907 }
3908 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3909 output = PyUnicode_EncodeFSDefault(path);
3910 Py_DECREF(path);
3911 if (!output) {
3912 return 0;
3913 }
3914 assert(PyBytes_Check(output));
3915 }
3916
Victor Stinner0ea2a462010-04-30 00:22:08 +00003917 size = PyBytes_GET_SIZE(output);
3918 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003919 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003920 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003921 Py_DECREF(output);
3922 return 0;
3923 }
3924 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003925 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003926}
3927
3928
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929int
3930PyUnicode_FSDecoder(PyObject* arg, void* addr)
3931{
Brett Cannona5711202016-09-06 19:36:01 -07003932 int is_buffer = 0;
3933 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003934 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003935 if (arg == NULL) {
3936 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka7a113a02017-04-20 22:55:06 +03003937 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003938 return 1;
3939 }
Brett Cannona5711202016-09-06 19:36:01 -07003940
3941 is_buffer = PyObject_CheckBuffer(arg);
3942 if (!is_buffer) {
3943 path = PyOS_FSPath(arg);
3944 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003945 return 0;
3946 }
Brett Cannona5711202016-09-06 19:36:01 -07003947 }
3948 else {
3949 path = arg;
3950 Py_INCREF(arg);
3951 }
3952
3953 if (PyUnicode_Check(path)) {
3954 if (PyUnicode_READY(path) == -1) {
3955 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003956 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003957 }
3958 output = path;
3959 }
3960 else if (PyBytes_Check(path) || is_buffer) {
3961 PyObject *path_bytes = NULL;
3962
3963 if (!PyBytes_Check(path) &&
3964 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3965 "path should be string, bytes, or os.PathLike, not %.200s",
3966 Py_TYPE(arg)->tp_name)) {
3967 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003968 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003969 }
3970 path_bytes = PyBytes_FromObject(path);
3971 Py_DECREF(path);
3972 if (!path_bytes) {
3973 return 0;
3974 }
3975 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3976 PyBytes_GET_SIZE(path_bytes));
3977 Py_DECREF(path_bytes);
3978 if (!output) {
3979 return 0;
3980 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003981 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003982 else {
3983 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003984 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003985 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003986 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003987 return 0;
3988 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003989 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003990 Py_DECREF(output);
3991 return 0;
3992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003994 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003995 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003996 Py_DECREF(output);
3997 return 0;
3998 }
3999 *(PyObject**)addr = output;
4000 return Py_CLEANUP_SUPPORTED;
4001}
4002
4003
Martin v. Löwis5b222132007-06-10 09:51:05 +00004004char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004006{
Christian Heimesf3863112007-11-22 07:46:41 +00004007 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004009 if (!PyUnicode_Check(unicode)) {
4010 PyErr_BadArgument();
4011 return NULL;
4012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004013 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004014 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004016 if (PyUnicode_UTF8(unicode) == NULL) {
4017 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004018 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 if (bytes == NULL)
4020 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004021 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4022 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004023 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 Py_DECREF(bytes);
4025 return NULL;
4026 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004027 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004028 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004029 PyBytes_AS_STRING(bytes),
4030 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 Py_DECREF(bytes);
4032 }
4033
4034 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004035 *psize = PyUnicode_UTF8_LENGTH(unicode);
4036 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004037}
4038
4039char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4043}
4044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045Py_UNICODE *
4046PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 const unsigned char *one_byte;
4049#if SIZEOF_WCHAR_T == 4
4050 const Py_UCS2 *two_bytes;
4051#else
4052 const Py_UCS4 *four_bytes;
4053 const Py_UCS4 *ucs4_end;
4054 Py_ssize_t num_surrogates;
4055#endif
4056 wchar_t *w;
4057 wchar_t *wchar_end;
4058
4059 if (!PyUnicode_Check(unicode)) {
4060 PyErr_BadArgument();
4061 return NULL;
4062 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004063 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004065 assert(_PyUnicode_KIND(unicode) != 0);
4066 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004068 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004070 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4071 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 num_surrogates = 0;
4073
4074 for (; four_bytes < ucs4_end; ++four_bytes) {
4075 if (*four_bytes > 0xFFFF)
4076 ++num_surrogates;
4077 }
4078
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004079 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4080 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4081 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 PyErr_NoMemory();
4083 return NULL;
4084 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004085 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004087 w = _PyUnicode_WSTR(unicode);
4088 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4089 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4091 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004092 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004094 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4095 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 }
4097 else
4098 *w = *four_bytes;
4099
4100 if (w > wchar_end) {
4101 assert(0 && "Miscalculated string end");
4102 }
4103 }
4104 *w = 0;
4105#else
4106 /* sizeof(wchar_t) == 4 */
4107 Py_FatalError("Impossible unicode object state, wstr and str "
4108 "should share memory already.");
4109 return NULL;
4110#endif
4111 }
4112 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004113 if ((size_t)_PyUnicode_LENGTH(unicode) >
4114 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4115 PyErr_NoMemory();
4116 return NULL;
4117 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004118 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4119 (_PyUnicode_LENGTH(unicode) + 1));
4120 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121 PyErr_NoMemory();
4122 return NULL;
4123 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004124 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4125 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4126 w = _PyUnicode_WSTR(unicode);
4127 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004129 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4130 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 for (; w < wchar_end; ++one_byte, ++w)
4132 *w = *one_byte;
4133 /* null-terminate the wstr */
4134 *w = 0;
4135 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004136 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004138 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139 for (; w < wchar_end; ++two_bytes, ++w)
4140 *w = *two_bytes;
4141 /* null-terminate the wstr */
4142 *w = 0;
4143#else
4144 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004145 PyObject_FREE(_PyUnicode_WSTR(unicode));
4146 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 Py_FatalError("Impossible unicode object state, wstr "
4148 "and str should share memory already.");
4149 return NULL;
4150#endif
4151 }
4152 else {
4153 assert(0 && "This should never happen.");
4154 }
4155 }
4156 }
4157 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004158 *size = PyUnicode_WSTR_LENGTH(unicode);
4159 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004160}
4161
Alexander Belopolsky40018472011-02-26 01:02:56 +00004162Py_UNICODE *
4163PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166}
4167
Serhiy Storchaka08349052017-06-28 09:27:35 +03004168const Py_UNICODE *
4169_PyUnicode_AsUnicode(PyObject *unicode)
4170{
4171 Py_ssize_t size;
4172 const Py_UNICODE *wstr;
4173
4174 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4175 if (wstr && wcslen(wstr) != (size_t)size) {
4176 PyErr_SetString(PyExc_ValueError, "embedded null character");
4177 return NULL;
4178 }
4179 return wstr;
4180}
4181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182
Alexander Belopolsky40018472011-02-26 01:02:56 +00004183Py_ssize_t
4184PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185{
4186 if (!PyUnicode_Check(unicode)) {
4187 PyErr_BadArgument();
4188 goto onError;
4189 }
4190 return PyUnicode_GET_SIZE(unicode);
4191
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 return -1;
4194}
4195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196Py_ssize_t
4197PyUnicode_GetLength(PyObject *unicode)
4198{
Victor Stinner07621332012-06-16 04:53:46 +02004199 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004200 PyErr_BadArgument();
4201 return -1;
4202 }
Victor Stinner07621332012-06-16 04:53:46 +02004203 if (PyUnicode_READY(unicode) == -1)
4204 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 return PyUnicode_GET_LENGTH(unicode);
4206}
4207
4208Py_UCS4
4209PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4210{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004211 void *data;
4212 int kind;
4213
Serhiy Storchakaddb536b2017-09-08 10:43:54 +03004214 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004215 PyErr_BadArgument();
4216 return (Py_UCS4)-1;
4217 }
Serhiy Storchakaddb536b2017-09-08 10:43:54 +03004218 if (PyUnicode_READY(unicode) == -1) {
4219 return (Py_UCS4)-1;
4220 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004221 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004222 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004223 return (Py_UCS4)-1;
4224 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004225 data = PyUnicode_DATA(unicode);
4226 kind = PyUnicode_KIND(unicode);
4227 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228}
4229
4230int
4231PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4232{
4233 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004234 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235 return -1;
4236 }
Victor Stinner488fa492011-12-12 00:01:39 +01004237 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004238 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004239 PyErr_SetString(PyExc_IndexError, "string index out of range");
4240 return -1;
4241 }
Victor Stinner488fa492011-12-12 00:01:39 +01004242 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004243 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004244 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4245 PyErr_SetString(PyExc_ValueError, "character out of range");
4246 return -1;
4247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4249 index, ch);
4250 return 0;
4251}
4252
/* Return the name of the default encoding, the constant string "utf-8".
   The returned pointer refers to a string literal. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4258
Victor Stinner554f3f02010-06-16 23:33:54 +00004259/* create or adjust a UnicodeDecodeError */
4260static void
4261make_decode_exception(PyObject **exceptionObject,
4262 const char *encoding,
4263 const char *input, Py_ssize_t length,
4264 Py_ssize_t startpos, Py_ssize_t endpos,
4265 const char *reason)
4266{
4267 if (*exceptionObject == NULL) {
4268 *exceptionObject = PyUnicodeDecodeError_Create(
4269 encoding, input, length, startpos, endpos, reason);
4270 }
4271 else {
4272 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4273 goto onError;
4274 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4275 goto onError;
4276 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4277 goto onError;
4278 }
4279 return;
4280
4281onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004282 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004283}
4284
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders whose output is a
   wchar_t-based string (*output, write position *outpos):
   build the exception, call the error handler callback and check its
   result, which must be a (str, int) tuple;
   if no exception occurred, copy the replacement to the output (growing it
   when needed) and adjust the state variables:
     - *input / *inend are reloaded from the exception's "object"
       attribute, since the callback may have replaced it;
     - *endinpos and *inptr are set to the position the callback asked
       decoding to resume at (negative positions count from the end);
     - *outpos is advanced past the replacement.
   *errorHandler is looked up from `errors` on first use and cached.
   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The first 4 characters are the PyArg_ParseTuple format; the text
       after the ';' doubles as the TypeError message (&argparse[4]). */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the PyBytes_* accessors below are only valid on a
           bytes object, and we must not report success with an
           exception pending. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400
4401static int
4402unicode_decode_call_errorhandler_writer(
4403 const char *errors, PyObject **errorHandler,
4404 const char *encoding, const char *reason,
4405 const char **input, const char **inend, Py_ssize_t *startinpos,
4406 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4407 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4408{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004409 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410
4411 PyObject *restuple = NULL;
4412 PyObject *repunicode = NULL;
4413 Py_ssize_t insize;
4414 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004415 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 PyObject *inputobj = NULL;
4417
4418 if (*errorHandler == NULL) {
4419 *errorHandler = PyCodec_LookupError(errors);
4420 if (*errorHandler == NULL)
4421 goto onError;
4422 }
4423
4424 make_decode_exception(exceptionObject,
4425 encoding,
4426 *input, *inend - *input,
4427 *startinpos, *endinpos,
4428 reason);
4429 if (*exceptionObject == NULL)
4430 goto onError;
4431
4432 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4433 if (restuple == NULL)
4434 goto onError;
4435 if (!PyTuple_Check(restuple)) {
4436 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4437 goto onError;
4438 }
4439 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004440 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004441
4442 /* Copy back the bytes variables, which might have been modified by the
4443 callback */
4444 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4445 if (!inputobj)
4446 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004447 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004449 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004450 *input = PyBytes_AS_STRING(inputobj);
4451 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004452 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004453 /* we can DECREF safely, as the exception has another reference,
4454 so the object won't go away. */
4455 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004459 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004460 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463
Victor Stinner8f674cc2013-04-17 23:02:17 +02004464 if (PyUnicode_READY(repunicode) < 0)
4465 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004466 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004467 if (replen > 1) {
4468 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004469 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004470 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4471 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4472 goto onError;
4473 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004474 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004475 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004478 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004481 Py_XDECREF(restuple);
4482 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004486 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487}
4488
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: all macros in this section may evaluate their arguments more than
   once, so arguments must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) can be passed safely.  The range
 * test on c comes first, so utf7_category is never indexed out of
 * bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Alexander Belopolsky40018472011-02-26 01:02:56 +00004570PyObject *
4571PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004572 Py_ssize_t size,
4573 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004574{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004575 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4576}
4577
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   the UTF-7 input bytes and their count
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer()
 *   consumed  if NULL, an inconsistent shift sequence left open at the end
 *             of the input is reported through the error handler; if
 *             non-NULL, such a trailing sequence is dropped from the
 *             output and *consumed receives the offset of its '+'
 *             (otherwise the number of bytes read).
 *
 * Returns the decoded object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+...' base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos at the start of the current section */
    unsigned int base64bits = 0;    /* number of valid bits held in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, or 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by `goto restart` from the end-of-string handling
           below, when the error handler leaves input still to decode. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is, then
                               handle outCh normally below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes directly; in either
                   case it is then forgotten. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts: used for error
               reporting and for *consumed */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input[startinpos:endinpos] to the error handler; it may
           update starts, e and s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back into the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unterminated shift sequence: report everything before its
               '+' as consumed and drop what it produced so far. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from only the first shiftOutStart
                   characters instead of finishing the writer.
                   NOTE(review): presumably because the dropped characters
                   may have raised writer.maxchar -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4776
4777
Alexander Belopolsky40018472011-02-26 01:02:56 +00004778PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004779_PyUnicode_EncodeUTF7(PyObject *str,
4780 int base64SetO,
4781 int base64WhiteSpace,
4782 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004783{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004784 int kind;
4785 void *data;
4786 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004787 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004789 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 unsigned int base64bits = 0;
4791 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 char * out;
4793 char * start;
4794
Benjamin Petersonbac79492012-01-14 13:34:47 -05004795 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004796 return NULL;
4797 kind = PyUnicode_KIND(str);
4798 data = PyUnicode_DATA(str);
4799 len = PyUnicode_GET_LENGTH(str);
4800
4801 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004803
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004804 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004805 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004806 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004807 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004808 if (v == NULL)
4809 return NULL;
4810
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004811 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004812 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004813 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004814
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 if (inShift) {
4816 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4817 /* shifting out */
4818 if (base64bits) { /* output remaining bits */
4819 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4820 base64buffer = 0;
4821 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004822 }
4823 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824 /* Characters not in the BASE64 set implicitly unshift the sequence
4825 so no '-' is required, except if the character is itself a '-' */
4826 if (IS_BASE64(ch) || ch == '-') {
4827 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004829 *out++ = (char) ch;
4830 }
4831 else {
4832 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004833 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 else { /* not in a shift sequence */
4836 if (ch == '+') {
4837 *out++ = '+';
4838 *out++ = '-';
4839 }
4840 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4841 *out++ = (char) ch;
4842 }
4843 else {
4844 *out++ = '+';
4845 inShift = 1;
4846 goto encode_char;
4847 }
4848 }
4849 continue;
4850encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004851 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004852 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004853
Antoine Pitrou244651a2009-05-04 18:56:13 +00004854 /* code first surrogate */
4855 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004856 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857 while (base64bits >= 6) {
4858 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4859 base64bits -= 6;
4860 }
4861 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004862 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004864 base64bits += 16;
4865 base64buffer = (base64buffer << 16) | ch;
4866 while (base64bits >= 6) {
4867 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4868 base64bits -= 6;
4869 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004870 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004871 if (base64bits)
4872 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4873 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004875 if (_PyBytes_Resize(&v, out - start) < 0)
4876 return NULL;
4877 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004878}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004879PyObject *
4880PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4881 Py_ssize_t size,
4882 int base64SetO,
4883 int base64WhiteSpace,
4884 const char *errors)
4885{
4886 PyObject *result;
4887 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4888 if (tmp == NULL)
4889 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004890 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004891 base64WhiteSpace, errors);
4892 Py_DECREF(tmp);
4893 return result;
4894}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004895
/* The helper macros above are only needed by the UTF-7 codec; undefine
   them so the names are not visible to the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004901
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902/* --- UTF-8 Codec -------------------------------------------------------- */
4903
Alexander Belopolsky40018472011-02-26 01:02:56 +00004904PyObject *
4905PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004906 Py_ssize_t size,
4907 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
Walter Dörwald69652032004-09-07 20:24:22 +00004909 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4910}
4911
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912#include "stringlib/asciilib.h"
4913#include "stringlib/codecs.h"
4914#include "stringlib/undef.h"
4915
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004916#include "stringlib/ucs1lib.h"
4917#include "stringlib/codecs.h"
4918#include "stringlib/undef.h"
4919
4920#include "stringlib/ucs2lib.h"
4921#include "stringlib/codecs.h"
4922#include "stringlib/undef.h"
4923
4924#include "stringlib/ucs4lib.h"
4925#include "stringlib/codecs.h"
4926#include "stringlib/undef.h"
4927
Antoine Pitrouab868312009-01-10 15:40:25 +00004928/* Mask to quickly check whether a C 'long' contains a
4929 non-ASCII, UTF8-encoded char. */
4930#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004931# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004932#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004933# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004934#else
4935# error C 'long' size should be either 4 or 8!
4936#endif
4937
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938static Py_ssize_t
4939ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004942 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004943
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004944 /*
4945 * Issue #17237: m68k is a bit different from most architectures in
4946 * that objects do not use "natural alignment" - for example, int and
4947 * long are only aligned at 2-byte boundaries. Therefore the assert()
4948 * won't work; also, tests have shown that skipping the "optimised
4949 * version" will even speed up m68k.
4950 */
4951#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004953 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4954 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 /* Fast path, see in STRINGLIB(utf8_decode) for
4956 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004957 /* Help allocation */
4958 const char *_p = p;
4959 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 while (_p < aligned_end) {
4961 unsigned long value = *(const unsigned long *) _p;
4962 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 *((unsigned long *)q) = value;
4965 _p += SIZEOF_LONG;
4966 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004967 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 p = _p;
4969 while (p < end) {
4970 if ((unsigned char)*p & 0x80)
4971 break;
4972 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004977#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 while (p < end) {
4979 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4980 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004981 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004982 /* Help allocation */
4983 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 while (_p < aligned_end) {
4985 unsigned long value = *(unsigned long *) _p;
4986 if (value & ASCII_CHAR_MASK)
4987 break;
4988 _p += SIZEOF_LONG;
4989 }
4990 p = _p;
4991 if (_p == end)
4992 break;
4993 }
4994 if ((unsigned char)*p & 0x80)
4995 break;
4996 ++p;
4997 }
4998 memcpy(dest, start, p - start);
4999 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000}
Antoine Pitrouab868312009-01-10 15:40:25 +00005001
Victor Stinner785938e2011-12-11 20:09:03 +01005002PyObject *
5003PyUnicode_DecodeUTF8Stateful(const char *s,
5004 Py_ssize_t size,
5005 const char *errors,
5006 Py_ssize_t *consumed)
5007{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01005009 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011
5012 Py_ssize_t startinpos;
5013 Py_ssize_t endinpos;
5014 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02005015 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02005017 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01005018
5019 if (size == 0) {
5020 if (consumed)
5021 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005022 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005023 }
5024
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5026 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01005027 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 *consumed = 1;
5029 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005030 }
5031
Victor Stinner8f674cc2013-04-17 23:02:17 +02005032 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005033 writer.min_length = size;
5034 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005035 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01005036
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 writer.pos = ascii_decode(s, end, writer.data);
5038 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 while (s < end) {
5040 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005041 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005042
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005044 if (PyUnicode_IS_ASCII(writer.buffer))
5045 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005046 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005047 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005048 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 } else {
5051 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005052 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 }
5054
5055 switch (ch) {
5056 case 0:
5057 if (s == end || consumed)
5058 goto End;
5059 errmsg = "unexpected end of data";
5060 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005061 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 break;
5063 case 1:
5064 errmsg = "invalid start byte";
5065 startinpos = s - starts;
5066 endinpos = startinpos + 1;
5067 break;
5068 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005069 case 3:
5070 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 errmsg = "invalid continuation byte";
5072 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005073 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 break;
5075 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005076 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 goto onError;
5078 continue;
5079 }
5080
Victor Stinner1d65d912015-10-05 13:43:50 +02005081 if (error_handler == _Py_ERROR_UNKNOWN)
5082 error_handler = get_error_handler(errors);
5083
5084 switch (error_handler) {
5085 case _Py_ERROR_IGNORE:
5086 s += (endinpos - startinpos);
5087 break;
5088
5089 case _Py_ERROR_REPLACE:
5090 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5091 goto onError;
5092 s += (endinpos - startinpos);
5093 break;
5094
5095 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005096 {
5097 Py_ssize_t i;
5098
Victor Stinner1d65d912015-10-05 13:43:50 +02005099 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5100 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005101 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005102 ch = (Py_UCS4)(unsigned char)(starts[i]);
5103 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5104 ch + 0xdc00);
5105 writer.pos++;
5106 }
5107 s += (endinpos - startinpos);
5108 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005109 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005110
5111 default:
5112 if (unicode_decode_call_errorhandler_writer(
5113 errors, &error_handler_obj,
5114 "utf-8", errmsg,
5115 &starts, &end, &startinpos, &endinpos, &exc, &s,
5116 &writer))
5117 goto onError;
5118 }
Victor Stinner785938e2011-12-11 20:09:03 +01005119 }
5120
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005121End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005122 if (consumed)
5123 *consumed = s - starts;
5124
Victor Stinner1d65d912015-10-05 13:43:50 +02005125 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005127 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128
5129onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005130 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005131 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005132 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005133 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005134}
5135
Xavier de Gaye76febd02016-12-15 20:59:58 +01005136#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137
5138/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005139 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005140
5141 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005142 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143
5144wchar_t*
5145_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5146{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005147 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 wchar_t *unicode;
5149 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150
5151 /* Note: size will always be longer than the resulting Unicode
5152 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005153 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005154 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005155 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005156 if (!unicode)
5157 return NULL;
5158
5159 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005160 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005161 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005162 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005164#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005166#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005168#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005169 if (ch > 0xFF) {
5170#if SIZEOF_WCHAR_T == 4
5171 assert(0);
5172#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005173 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 /* compute and append the two surrogates: */
5175 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5176 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5177#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005178 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 else {
5180 if (!ch && s == e)
5181 break;
5182 /* surrogateescape */
5183 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5184 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005185 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005187 return unicode;
5188}
5189
Xavier de Gaye76febd02016-12-15 20:59:58 +01005190#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005192/* Primary internal function which creates utf8 encoded bytes objects.
5193
5194 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005195 and allocate exactly as much space needed at the end. Else allocate the
5196 maximum possible needed (4 result bytes per Unicode character), and return
5197 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005198*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005199PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005200_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201{
Victor Stinner6099a032011-12-18 14:22:26 +01005202 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005203 void *data;
5204 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206 if (!PyUnicode_Check(unicode)) {
5207 PyErr_BadArgument();
5208 return NULL;
5209 }
5210
5211 if (PyUnicode_READY(unicode) == -1)
5212 return NULL;
5213
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005214 if (PyUnicode_UTF8(unicode))
5215 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5216 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005217
5218 kind = PyUnicode_KIND(unicode);
5219 data = PyUnicode_DATA(unicode);
5220 size = PyUnicode_GET_LENGTH(unicode);
5221
Benjamin Petersonead6b532011-12-20 17:23:42 -06005222 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005223 default:
5224 assert(0);
5225 case PyUnicode_1BYTE_KIND:
5226 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5227 assert(!PyUnicode_IS_ASCII(unicode));
5228 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5229 case PyUnicode_2BYTE_KIND:
5230 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5231 case PyUnicode_4BYTE_KIND:
5232 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234}
5235
Alexander Belopolsky40018472011-02-26 01:02:56 +00005236PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005237PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5238 Py_ssize_t size,
5239 const char *errors)
5240{
5241 PyObject *v, *unicode;
5242
5243 unicode = PyUnicode_FromUnicode(s, size);
5244 if (unicode == NULL)
5245 return NULL;
5246 v = _PyUnicode_AsUTF8String(unicode, errors);
5247 Py_DECREF(unicode);
5248 return v;
5249}
5250
5251PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005252PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005254 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255}
5256
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257/* --- UTF-32 Codec ------------------------------------------------------- */
5258
5259PyObject *
5260PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 Py_ssize_t size,
5262 const char *errors,
5263 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264{
5265 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5266}
5267
5268PyObject *
5269PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_ssize_t size,
5271 const char *errors,
5272 int *byteorder,
5273 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274{
5275 const char *starts = s;
5276 Py_ssize_t startinpos;
5277 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005278 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005279 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005280 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005281 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005283 PyObject *errorHandler = NULL;
5284 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005285
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286 q = (unsigned char *)s;
5287 e = q + size;
5288
5289 if (byteorder)
5290 bo = *byteorder;
5291
5292 /* Check for BOM marks (U+FEFF) in the input and adjust current
5293 byte order setting accordingly. In native mode, the leading BOM
5294 mark is skipped, in all other modes, it is copied to the output
5295 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005297 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 if (bom == 0x0000FEFF) {
5299 bo = -1;
5300 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 else if (bom == 0xFFFE0000) {
5303 bo = 1;
5304 q += 4;
5305 }
5306 if (byteorder)
5307 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308 }
5309
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 if (q == e) {
5311 if (consumed)
5312 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005313 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314 }
5315
Victor Stinnere64322e2012-10-30 23:12:47 +01005316#ifdef WORDS_BIGENDIAN
5317 le = bo < 0;
5318#else
5319 le = bo <= 0;
5320#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005321 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005322
Victor Stinner8f674cc2013-04-17 23:02:17 +02005323 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005324 writer.min_length = (e - q + 3) / 4;
5325 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005326 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005327
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 while (1) {
5329 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005331
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005333 enum PyUnicode_Kind kind = writer.kind;
5334 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005336 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005337 if (le) {
5338 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005339 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 if (ch > maxch)
5341 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005342 if (kind != PyUnicode_1BYTE_KIND &&
5343 Py_UNICODE_IS_SURROGATE(ch))
5344 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 q += 4;
5347 } while (q <= last);
5348 }
5349 else {
5350 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005351 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005352 if (ch > maxch)
5353 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 if (kind != PyUnicode_1BYTE_KIND &&
5355 Py_UNICODE_IS_SURROGATE(ch))
5356 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005358 q += 4;
5359 } while (q <= last);
5360 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 }
5363
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005364 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005365 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005366 startinpos = ((const char *)q) - starts;
5367 endinpos = startinpos + 4;
5368 }
5369 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005370 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005372 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005374 startinpos = ((const char *)q) - starts;
5375 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005377 else {
5378 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005379 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005380 goto onError;
5381 q += 4;
5382 continue;
5383 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005384 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005385 startinpos = ((const char *)q) - starts;
5386 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005388
5389 /* The remaining input chars are ignored if the callback
5390 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005391 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005393 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005395 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005397 }
5398
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005401
Walter Dörwald41980ca2007-08-16 21:55:45 +00005402 Py_XDECREF(errorHandler);
5403 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005404 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005407 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408 Py_XDECREF(errorHandler);
5409 Py_XDECREF(exc);
5410 return NULL;
5411}
5412
5413PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005414_PyUnicode_EncodeUTF32(PyObject *str,
5415 const char *errors,
5416 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 enum PyUnicode_Kind kind;
5419 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005420 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005421 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005422 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005423#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005425#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005430 PyObject *errorHandler = NULL;
5431 PyObject *exc = NULL;
5432 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005433
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005434 if (!PyUnicode_Check(str)) {
5435 PyErr_BadArgument();
5436 return NULL;
5437 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005438 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005439 return NULL;
5440 kind = PyUnicode_KIND(str);
5441 data = PyUnicode_DATA(str);
5442 len = PyUnicode_GET_LENGTH(str);
5443
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005444 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005445 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005446 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005447 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005448 if (v == NULL)
5449 return NULL;
5450
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005451 /* output buffer is 4-bytes aligned */
5452 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005453 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005454 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005455 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005456 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005458
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005463 else
5464 encoding = "utf-32";
5465
5466 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005467 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5468 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005469 }
5470
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005471 pos = 0;
5472 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005474
5475 if (kind == PyUnicode_2BYTE_KIND) {
5476 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5477 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005479 else {
5480 assert(kind == PyUnicode_4BYTE_KIND);
5481 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5482 &out, native_ordering);
5483 }
5484 if (pos == len)
5485 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005486
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 rep = unicode_encode_call_errorhandler(
5488 errors, &errorHandler,
5489 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005490 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 if (!rep)
5492 goto error;
5493
5494 if (PyBytes_Check(rep)) {
5495 repsize = PyBytes_GET_SIZE(rep);
5496 if (repsize & 3) {
5497 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 "surrogates not allowed");
5500 goto error;
5501 }
5502 moreunits = repsize / 4;
5503 }
5504 else {
5505 assert(PyUnicode_Check(rep));
5506 if (PyUnicode_READY(rep) < 0)
5507 goto error;
5508 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5509 if (!PyUnicode_IS_ASCII(rep)) {
5510 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005511 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 "surrogates not allowed");
5513 goto error;
5514 }
5515 }
5516
5517 /* four bytes are reserved for each surrogate */
5518 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005519 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka82a90752017-07-11 07:27:56 +03005520 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 /* integer overflow */
5522 PyErr_NoMemory();
5523 goto error;
5524 }
Serhiy Storchaka82a90752017-07-11 07:27:56 +03005525 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005526 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005527 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005528 }
5529
5530 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005531 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005532 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005533 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005534 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005535 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5536 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005537 }
5538
5539 Py_CLEAR(rep);
5540 }
5541
5542 /* Cut back to size actually needed. This is necessary for, for example,
5543 encoding of a string containing isolated surrogates and the 'ignore'
5544 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005545 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005546 if (nsize != PyBytes_GET_SIZE(v))
5547 _PyBytes_Resize(&v, nsize);
5548 Py_XDECREF(errorHandler);
5549 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005550 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005551 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005552 error:
5553 Py_XDECREF(rep);
5554 Py_XDECREF(errorHandler);
5555 Py_XDECREF(exc);
5556 Py_XDECREF(v);
5557 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005558}
5559
Alexander Belopolsky40018472011-02-26 01:02:56 +00005560PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005561PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5562 Py_ssize_t size,
5563 const char *errors,
5564 int byteorder)
5565{
5566 PyObject *result;
5567 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5568 if (tmp == NULL)
5569 return NULL;
5570 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5571 Py_DECREF(tmp);
5572 return result;
5573}
5574
5575PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005576PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005577{
Victor Stinnerb960b342011-11-20 19:12:52 +01005578 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005579}
5580
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581/* --- UTF-16 Codec ------------------------------------------------------- */
5582
Tim Peters772747b2001-08-09 22:21:55 +00005583PyObject *
5584PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 Py_ssize_t size,
5586 const char *errors,
5587 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588{
Walter Dörwald69652032004-09-07 20:24:22 +00005589 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5590}
5591
5592PyObject *
5593PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 Py_ssize_t size,
5595 const char *errors,
5596 int *byteorder,
5597 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005598{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005600 Py_ssize_t startinpos;
5601 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005602 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005604 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005606 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 PyObject *errorHandler = NULL;
5608 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005609 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
Tim Peters772747b2001-08-09 22:21:55 +00005611 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005615 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005617 /* Check for BOM marks (U+FEFF) in the input and adjust current
5618 byte order setting accordingly. In native mode, the leading BOM
5619 mark is skipped, in all other modes, it is copied to the output
5620 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005621 if (bo == 0 && size >= 2) {
5622 const Py_UCS4 bom = (q[1] << 8) | q[0];
5623 if (bom == 0xFEFF) {
5624 q += 2;
5625 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 else if (bom == 0xFFFE) {
5628 q += 2;
5629 bo = 1;
5630 }
5631 if (byteorder)
5632 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634
Antoine Pitrou63065d72012-05-15 23:48:04 +02005635 if (q == e) {
5636 if (consumed)
5637 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005638 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005639 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005640
Christian Heimes743e0cd2012-10-17 23:52:17 +02005641#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005643 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005644#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005645 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005646 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005647#endif
Tim Peters772747b2001-08-09 22:21:55 +00005648
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 /* Note: size will always be longer than the resulting Unicode
5650 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005651 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005652 writer.min_length = (e - q + 1) / 2;
5653 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005654 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005655
Antoine Pitrou63065d72012-05-15 23:48:04 +02005656 while (1) {
5657 Py_UCS4 ch = 0;
5658 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005659 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005660 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005661 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005662 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005663 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005664 native_ordering);
5665 else
5666 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005667 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005668 native_ordering);
5669 } else if (kind == PyUnicode_2BYTE_KIND) {
5670 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005671 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005672 native_ordering);
5673 } else {
5674 assert(kind == PyUnicode_4BYTE_KIND);
5675 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005676 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005677 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005678 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680
Antoine Pitrou63065d72012-05-15 23:48:04 +02005681 switch (ch)
5682 {
5683 case 0:
5684 /* remaining byte at the end? (size should be even) */
5685 if (q == e || consumed)
5686 goto End;
5687 errmsg = "truncated data";
5688 startinpos = ((const char *)q) - starts;
5689 endinpos = ((const char *)e) - starts;
5690 break;
5691 /* The remaining input chars are ignored if the callback
5692 chooses to skip the input */
5693 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005694 q -= 2;
5695 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005696 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005697 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005698 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005699 endinpos = ((const char *)e) - starts;
5700 break;
5701 case 2:
5702 errmsg = "illegal encoding";
5703 startinpos = ((const char *)q) - 2 - starts;
5704 endinpos = startinpos + 2;
5705 break;
5706 case 3:
5707 errmsg = "illegal UTF-16 surrogate";
5708 startinpos = ((const char *)q) - 4 - starts;
5709 endinpos = startinpos + 2;
5710 break;
5711 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005712 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 continue;
5715 }
5716
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005717 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005718 errors,
5719 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005720 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005721 &starts,
5722 (const char **)&e,
5723 &startinpos,
5724 &endinpos,
5725 &exc,
5726 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005727 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 }
5730
Antoine Pitrou63065d72012-05-15 23:48:04 +02005731End:
Walter Dörwald69652032004-09-07 20:24:22 +00005732 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005737 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005740 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
5744}
5745
Tim Peters772747b2001-08-09 22:21:55 +00005746PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747_PyUnicode_EncodeUTF16(PyObject *str,
5748 const char *errors,
5749 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005751 enum PyUnicode_Kind kind;
5752 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005754 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005755 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005756 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005757#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005758 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005759#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005760 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005761#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005762 const char *encoding;
5763 Py_ssize_t nsize, pos;
5764 PyObject *errorHandler = NULL;
5765 PyObject *exc = NULL;
5766 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005767
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005768 if (!PyUnicode_Check(str)) {
5769 PyErr_BadArgument();
5770 return NULL;
5771 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005772 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005773 return NULL;
5774 kind = PyUnicode_KIND(str);
5775 data = PyUnicode_DATA(str);
5776 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005777
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005778 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005779 if (kind == PyUnicode_4BYTE_KIND) {
5780 const Py_UCS4 *in = (const Py_UCS4 *)data;
5781 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 while (in < end) {
5783 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005784 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005785 }
5786 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005787 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005788 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005790 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005791 nsize = len + pairs + (byteorder == 0);
5792 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005793 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005797 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005798 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005799 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005800 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005801 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005802 }
5803 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005804 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005805 }
Tim Peters772747b2001-08-09 22:21:55 +00005806
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005807 if (kind == PyUnicode_1BYTE_KIND) {
5808 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5809 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005810 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005811
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005812 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005813 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005814 }
5815 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005816 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005817 }
5818 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005819 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005820 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005821
5822 pos = 0;
5823 while (pos < len) {
5824 Py_ssize_t repsize, moreunits;
5825
5826 if (kind == PyUnicode_2BYTE_KIND) {
5827 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5828 &out, native_ordering);
5829 }
5830 else {
5831 assert(kind == PyUnicode_4BYTE_KIND);
5832 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5833 &out, native_ordering);
5834 }
5835 if (pos == len)
5836 break;
5837
5838 rep = unicode_encode_call_errorhandler(
5839 errors, &errorHandler,
5840 encoding, "surrogates not allowed",
5841 str, &exc, pos, pos + 1, &pos);
5842 if (!rep)
5843 goto error;
5844
5845 if (PyBytes_Check(rep)) {
5846 repsize = PyBytes_GET_SIZE(rep);
5847 if (repsize & 1) {
5848 raise_encode_exception(&exc, encoding,
5849 str, pos - 1, pos,
5850 "surrogates not allowed");
5851 goto error;
5852 }
5853 moreunits = repsize / 2;
5854 }
5855 else {
5856 assert(PyUnicode_Check(rep));
5857 if (PyUnicode_READY(rep) < 0)
5858 goto error;
5859 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5860 if (!PyUnicode_IS_ASCII(rep)) {
5861 raise_encode_exception(&exc, encoding,
5862 str, pos - 1, pos,
5863 "surrogates not allowed");
5864 goto error;
5865 }
5866 }
5867
5868 /* two bytes are reserved for each surrogate */
5869 if (moreunits > 1) {
5870 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka82a90752017-07-11 07:27:56 +03005871 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005872 /* integer overflow */
5873 PyErr_NoMemory();
5874 goto error;
5875 }
Serhiy Storchaka82a90752017-07-11 07:27:56 +03005876 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005877 goto error;
5878 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5879 }
5880
5881 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005882 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005883 out += moreunits;
5884 } else /* rep is unicode */ {
5885 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5886 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5887 &out, native_ordering);
5888 }
5889
5890 Py_CLEAR(rep);
5891 }
5892
5893 /* Cut back to size actually needed. This is necessary for, for example,
5894 encoding of a string containing isolated surrogates and the 'ignore' handler
5895 is used. */
5896 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5897 if (nsize != PyBytes_GET_SIZE(v))
5898 _PyBytes_Resize(&v, nsize);
5899 Py_XDECREF(errorHandler);
5900 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005901 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005903 error:
5904 Py_XDECREF(rep);
5905 Py_XDECREF(errorHandler);
5906 Py_XDECREF(exc);
5907 Py_XDECREF(v);
5908 return NULL;
5909#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910}
5911
Alexander Belopolsky40018472011-02-26 01:02:56 +00005912PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5914 Py_ssize_t size,
5915 const char *errors,
5916 int byteorder)
5917{
5918 PyObject *result;
5919 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5920 if (tmp == NULL)
5921 return NULL;
5922 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5923 Py_DECREF(tmp);
5924 return result;
5925}
5926
5927PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005928PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005930 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931}
5932
5933/* --- Unicode Escape Codec ----------------------------------------------- */
5934
/* Cached pointer to the character-name lookup C API.  It stays NULL until the
   unicode-escape decoder meets its first \N escape, at which point it is
   filled in via PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1). */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005935static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005936
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937PyObject *
Eric V. Smith56466482016-10-31 14:46:26 -04005938_PyUnicode_DecodeUnicodeEscape(const char *s,
5939 Py_ssize_t size,
5940 const char *errors,
5941 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005944 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 PyObject *errorHandler = NULL;
5947 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005948
Eric V. Smith56466482016-10-31 14:46:26 -04005949 // so we can remember if we've seen an invalid escape char or not
5950 *first_invalid_escape = NULL;
5951
Victor Stinner62ec3312016-09-06 17:04:34 -07005952 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005953 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005954 }
5955 /* Escaped strings will always be longer than the resulting
5956 Unicode string, so we start with size here and then reduce the
5957 length after conversion to the true value.
5958 (but if the error callback returns a long replacement string
5959 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005960 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 writer.min_length = size;
5962 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5963 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005964 }
5965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 end = s + size;
5967 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 unsigned char c = (unsigned char) *s++;
5969 Py_UCS4 ch;
5970 int count;
5971 Py_ssize_t startinpos;
5972 Py_ssize_t endinpos;
5973 const char *message;
5974
5975#define WRITE_ASCII_CHAR(ch) \
5976 do { \
5977 assert(ch <= 127); \
5978 assert(writer.pos < writer.size); \
5979 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5980 } while(0)
5981
5982#define WRITE_CHAR(ch) \
5983 do { \
5984 if (ch <= writer.maxchar) { \
5985 assert(writer.pos < writer.size); \
5986 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5987 } \
5988 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5989 goto onError; \
5990 } \
5991 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
5993 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 if (c != '\\') {
5995 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 continue;
5997 }
5998
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 if (s >= end) {
6002 message = "\\ at end of string";
6003 goto error;
6004 }
6005 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006006
Victor Stinner62ec3312016-09-06 17:04:34 -07006007 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006008 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 case '\n': continue;
6012 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6013 case '\'': WRITE_ASCII_CHAR('\''); continue;
6014 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6015 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006016 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6018 case 't': WRITE_ASCII_CHAR('\t'); continue;
6019 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6020 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006021 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006023 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 case '0': case '1': case '2': case '3':
6028 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006030 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006031 ch = (ch<<3) + *s++ - '0';
6032 if (s < end && '0' <= *s && *s <= '7') {
6033 ch = (ch<<3) + *s++ - '0';
6034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 WRITE_CHAR(ch);
6037 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* hex escapes */
6040 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006042 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006043 message = "truncated \\xXX escape";
6044 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006048 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 message = "truncated \\uXXXX escape";
6050 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006053 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 message = "truncated \\UXXXXXXXX escape";
6056 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006057 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006058 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 ch <<= 4;
6060 if (c >= '0' && c <= '9') {
6061 ch += c - '0';
6062 }
6063 else if (c >= 'a' && c <= 'f') {
6064 ch += c - ('a' - 10);
6065 }
6066 else if (c >= 'A' && c <= 'F') {
6067 ch += c - ('A' - 10);
6068 }
6069 else {
6070 break;
6071 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006072 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006074 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 }
6076
6077 /* when we get here, ch is a 32-bit unicode character */
6078 if (ch > MAX_UNICODE) {
6079 message = "illegal Unicode character";
6080 goto error;
6081 }
6082
6083 WRITE_CHAR(ch);
6084 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006087 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006088 if (ucnhash_CAPI == NULL) {
6089 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006090 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6091 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 if (ucnhash_CAPI == NULL) {
6093 PyErr_SetString(
6094 PyExc_UnicodeError,
6095 "\\N escapes not supported (can't load unicodedata module)"
6096 );
6097 goto onError;
6098 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006099 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006100
6101 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006102 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006103 const char *start = ++s;
6104 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006105 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006106 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006107 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 namelen = s - start;
6109 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006110 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006111 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 ch = 0xffffffff; /* in case 'getcode' messes up */
6113 if (namelen <= INT_MAX &&
6114 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6115 &ch, 0)) {
6116 assert(ch <= MAX_UNICODE);
6117 WRITE_CHAR(ch);
6118 continue;
6119 }
6120 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006121 }
6122 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006123 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006124
6125 default:
Eric V. Smith56466482016-10-31 14:46:26 -04006126 if (*first_invalid_escape == NULL) {
6127 *first_invalid_escape = s-1; /* Back up one char, since we've
6128 already incremented s. */
6129 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006130 WRITE_ASCII_CHAR('\\');
6131 WRITE_CHAR(c);
6132 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006134
6135 error:
6136 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006137 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006138 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006139 errors, &errorHandler,
6140 "unicodeescape", message,
6141 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006143 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 }
6145 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6146 goto onError;
6147 }
6148
6149#undef WRITE_ASCII_CHAR
6150#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006152
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006153 Py_XDECREF(errorHandler);
6154 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006155 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006156
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006158 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 Py_XDECREF(errorHandler);
6160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 return NULL;
6162}
6163
Eric V. Smith56466482016-10-31 14:46:26 -04006164PyObject *
6165PyUnicode_DecodeUnicodeEscape(const char *s,
6166 Py_ssize_t size,
6167 const char *errors)
6168{
6169 const char *first_invalid_escape;
6170 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6171 &first_invalid_escape);
6172 if (result == NULL)
6173 return NULL;
6174 if (first_invalid_escape != NULL) {
6175 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6176 "invalid escape sequence '\\%c'",
6177 *first_invalid_escape) < 0) {
6178 Py_DECREF(result);
6179 return NULL;
6180 }
6181 }
6182 return result;
6183}
6184
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006185/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006191 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
Ezio Melottie7f90372012-10-05 03:33:31 +03006197 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006198 escape.
6199
Ezio Melottie7f90372012-10-05 03:33:31 +03006200 For UCS1 strings it's '\xxx', 4 bytes per source character.
6201 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6202 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006203 */
6204
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 if (!PyUnicode_Check(unicode)) {
6206 PyErr_BadArgument();
6207 return NULL;
6208 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006209 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 }
Victor Stinner358af132015-10-12 22:36:57 +02006212
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 if (len == 0) {
6215 return PyBytes_FromStringAndSize(NULL, 0);
6216 }
6217
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 kind = PyUnicode_KIND(unicode);
6219 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006220 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6221 bytes, and 1 byte characters 4. */
6222 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006223 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 return PyErr_NoMemory();
6225 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006226 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 if (repr == NULL) {
6228 return NULL;
6229 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006230
Victor Stinner62ec3312016-09-06 17:04:34 -07006231 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006232 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006233 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006234
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 /* U+0000-U+00ff range */
6236 if (ch < 0x100) {
6237 if (ch >= ' ' && ch < 127) {
6238 if (ch != '\\') {
6239 /* Copy printable US ASCII as-is */
6240 *p++ = (char) ch;
6241 }
6242 /* Escape backslashes */
6243 else {
6244 *p++ = '\\';
6245 *p++ = '\\';
6246 }
6247 }
Victor Stinner358af132015-10-12 22:36:57 +02006248
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 /* Map special whitespace to '\t', \n', '\r' */
6250 else if (ch == '\t') {
6251 *p++ = '\\';
6252 *p++ = 't';
6253 }
6254 else if (ch == '\n') {
6255 *p++ = '\\';
6256 *p++ = 'n';
6257 }
6258 else if (ch == '\r') {
6259 *p++ = '\\';
6260 *p++ = 'r';
6261 }
6262
6263 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6264 else {
6265 *p++ = '\\';
6266 *p++ = 'x';
6267 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6268 *p++ = Py_hexdigits[ch & 0x000F];
6269 }
Tim Petersced69f82003-09-16 20:30:58 +00006270 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006271 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 *p++ = '\\';
6274 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006275 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6276 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6277 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6278 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6281 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006282
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 /* Make sure that the first two digits are zero */
6284 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006285 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 *p++ = 'U';
6287 *p++ = '0';
6288 *p++ = '0';
6289 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6290 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6291 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6292 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6293 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6294 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 assert(p - PyBytes_AS_STRING(repr) > 0);
6299 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6300 return NULL;
6301 }
6302 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303}
6304
Alexander Belopolsky40018472011-02-26 01:02:56 +00006305PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6307 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309 PyObject *result;
6310 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 }
6314
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315 result = PyUnicode_AsUnicodeEscapeString(tmp);
6316 Py_DECREF(tmp);
6317 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318}
6319
6320/* --- Raw Unicode Escape Codec ------------------------------------------- */
6321
Alexander Belopolsky40018472011-02-26 01:02:56 +00006322PyObject *
6323PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006324 Py_ssize_t size,
6325 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006328 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330 PyObject *errorHandler = NULL;
6331 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006332
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006334 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006336
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 /* Escaped strings will always be longer than the resulting
6338 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339 length after conversion to the true value. (But decoding error
6340 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006341 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 writer.min_length = size;
6343 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6344 goto onError;
6345 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006346
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 end = s + size;
6348 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 unsigned char c = (unsigned char) *s++;
6350 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006351 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 Py_ssize_t startinpos;
6353 Py_ssize_t endinpos;
6354 const char *message;
6355
6356#define WRITE_CHAR(ch) \
6357 do { \
6358 if (ch <= writer.maxchar) { \
6359 assert(writer.pos < writer.size); \
6360 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6361 } \
6362 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6363 goto onError; \
6364 } \
6365 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006368 if (c != '\\' || s >= end) {
6369 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006371 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006372
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 c = (unsigned char) *s++;
6374 if (c == 'u') {
6375 count = 4;
6376 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 else if (c == 'U') {
6379 count = 8;
6380 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006381 }
6382 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 assert(writer.pos < writer.size);
6384 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6385 WRITE_CHAR(c);
6386 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006387 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 startinpos = s - starts - 2;
6389
6390 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6391 for (ch = 0; count && s < end; ++s, --count) {
6392 c = (unsigned char)*s;
6393 ch <<= 4;
6394 if (c >= '0' && c <= '9') {
6395 ch += c - '0';
6396 }
6397 else if (c >= 'a' && c <= 'f') {
6398 ch += c - ('a' - 10);
6399 }
6400 else if (c >= 'A' && c <= 'F') {
6401 ch += c - ('A' - 10);
6402 }
6403 else {
6404 break;
6405 }
6406 }
6407 if (!count) {
6408 if (ch <= MAX_UNICODE) {
6409 WRITE_CHAR(ch);
6410 continue;
6411 }
6412 message = "\\Uxxxxxxxx out of range";
6413 }
6414
6415 endinpos = s-starts;
6416 writer.min_length = end - s + writer.pos;
6417 if (unicode_decode_call_errorhandler_writer(
6418 errors, &errorHandler,
6419 "rawunicodeescape", message,
6420 &starts, &end, &startinpos, &endinpos, &exc, &s,
6421 &writer)) {
6422 goto onError;
6423 }
6424 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6425 goto onError;
6426 }
6427
6428#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 Py_XDECREF(errorHandler);
6431 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006432 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006433
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006435 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 Py_XDECREF(errorHandler);
6437 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006442
Alexander Belopolsky40018472011-02-26 01:02:56 +00006443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445{
Victor Stinner62ec3312016-09-06 17:04:34 -07006446 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 int kind;
6450 void *data;
6451 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 if (!PyUnicode_Check(unicode)) {
6454 PyErr_BadArgument();
6455 return NULL;
6456 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006460 kind = PyUnicode_KIND(unicode);
6461 data = PyUnicode_DATA(unicode);
6462 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006463 if (kind == PyUnicode_1BYTE_KIND) {
6464 return PyBytes_FromStringAndSize(data, len);
6465 }
Victor Stinner0e368262011-11-10 20:12:49 +01006466
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6468 bytes, and 1 byte characters 4. */
6469 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006470
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 if (len > PY_SSIZE_T_MAX / expandsize) {
6472 return PyErr_NoMemory();
6473 }
6474 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6475 if (repr == NULL) {
6476 return NULL;
6477 }
6478 if (len == 0) {
6479 return repr;
6480 }
6481
6482 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483 for (pos = 0; pos < len; pos++) {
6484 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006485
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6487 if (ch < 0x100) {
6488 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006489 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6491 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 *p++ = '\\';
6493 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006494 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6495 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6496 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6497 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6500 else {
6501 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6502 *p++ = '\\';
6503 *p++ = 'U';
6504 *p++ = '0';
6505 *p++ = '0';
6506 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6507 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6508 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6509 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6510 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6511 *p++ = Py_hexdigits[ch & 15];
6512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006514
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 assert(p > PyBytes_AS_STRING(repr));
6516 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6517 return NULL;
6518 }
6519 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520}
6521
Alexander Belopolsky40018472011-02-26 01:02:56 +00006522PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006523PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6524 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006526 PyObject *result;
6527 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6528 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006529 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006530 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6531 Py_DECREF(tmp);
6532 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533}
6534
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535/* --- Unicode Internal Codec ------------------------------------------- */
6536
Alexander Belopolsky40018472011-02-26 01:02:56 +00006537PyObject *
6538_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006539 Py_ssize_t size,
6540 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006541{
6542 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006543 Py_ssize_t startinpos;
6544 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006545 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006546 const char *end;
6547 const char *reason;
6548 PyObject *errorHandler = NULL;
6549 PyObject *exc = NULL;
6550
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006551 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006552 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006553 1))
6554 return NULL;
6555
Serhiy Storchaka82a90752017-07-11 07:27:56 +03006556 if (size < 0) {
6557 PyErr_BadInternalCall();
6558 return NULL;
6559 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006560 if (size == 0)
6561 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006562
Victor Stinner8f674cc2013-04-17 23:02:17 +02006563 _PyUnicodeWriter_Init(&writer);
6564 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6565 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006567 }
6568 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006569
Victor Stinner8f674cc2013-04-17 23:02:17 +02006570 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006571 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006572 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006573 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006574 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006575 endinpos = end-starts;
6576 reason = "truncated input";
6577 goto error;
6578 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006579 /* We copy the raw representation one byte at a time because the
6580 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006581 ((char *) &uch)[0] = s[0];
6582 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006583#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006584 ((char *) &uch)[2] = s[2];
6585 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006586#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006587 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006588#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006589 /* We have to sanity check the raw data, otherwise doom looms for
6590 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006591 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006592 endinpos = s - starts + Py_UNICODE_SIZE;
6593 reason = "illegal code point (> 0x10FFFF)";
6594 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006595 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006596#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006597 s += Py_UNICODE_SIZE;
6598#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006599 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006600 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006601 Py_UNICODE uch2;
6602 ((char *) &uch2)[0] = s[0];
6603 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006604 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006605 {
Victor Stinner551ac952011-11-29 22:58:13 +01006606 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006607 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006608 }
6609 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006610#endif
6611
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006612 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006613 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006614 continue;
6615
6616 error:
6617 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006618 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006619 errors, &errorHandler,
6620 "unicode_internal", reason,
6621 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006622 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006623 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006624 }
6625
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006626 Py_XDECREF(errorHandler);
6627 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006628 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006629
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006631 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
6634 return NULL;
6635}
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637/* --- Latin-1 Codec ------------------------------------------------------ */
6638
Alexander Belopolsky40018472011-02-26 01:02:56 +00006639PyObject *
6640PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006641 Py_ssize_t size,
6642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006645 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646}
6647
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006649static void
6650make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006651 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006652 PyObject *unicode,
6653 Py_ssize_t startpos, Py_ssize_t endpos,
6654 const char *reason)
6655{
6656 if (*exceptionObject == NULL) {
6657 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006659 encoding, unicode, startpos, endpos, reason);
6660 }
6661 else {
6662 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6663 goto onError;
6664 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6665 goto onError;
6666 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6667 goto onError;
6668 return;
6669 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006670 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006671 }
6672}
6673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006675static void
6676raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006677 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006678 PyObject *unicode,
6679 Py_ssize_t startpos, Py_ssize_t endpos,
6680 const char *reason)
6681{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006682 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006683 encoding, unicode, startpos, endpos, reason);
6684 if (*exceptionObject != NULL)
6685 PyCodec_StrictErrors(*exceptionObject);
6686}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687
6688/* error handling callback helper:
6689 build arguments, call the callback and check the arguments,
6690 put the result into newpos and return the replacement string, which
6691 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006692static PyObject *
6693unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006694 PyObject **errorHandler,
6695 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006696 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006697 Py_ssize_t startpos, Py_ssize_t endpos,
6698 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006700 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006701 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 PyObject *restuple;
6703 PyObject *resunicode;
6704
6705 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 }
6710
Benjamin Petersonbac79492012-01-14 13:34:47 -05006711 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 return NULL;
6713 len = PyUnicode_GET_LENGTH(unicode);
6714
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006715 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719
6720 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006725 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 Py_DECREF(restuple);
6727 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006729 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 &resunicode, newpos)) {
6731 Py_DECREF(restuple);
6732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006734 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6735 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6736 Py_DECREF(restuple);
6737 return NULL;
6738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 *newpos = len + *newpos;
6741 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006742 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 Py_DECREF(restuple);
6744 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 Py_INCREF(resunicode);
6747 Py_DECREF(restuple);
6748 return resunicode;
6749}
6750
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006753 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006754 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 /* input state */
6757 Py_ssize_t pos=0, size;
6758 int kind;
6759 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006760 /* pointer into the output */
6761 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006762 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6763 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006764 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006766 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006767 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006768 /* output object */
6769 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770
Benjamin Petersonbac79492012-01-14 13:34:47 -05006771 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772 return NULL;
6773 size = PyUnicode_GET_LENGTH(unicode);
6774 kind = PyUnicode_KIND(unicode);
6775 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 /* allocate enough for a simple encoding without
6777 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006778 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006779 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006780
6781 _PyBytesWriter_Init(&writer);
6782 str = _PyBytesWriter_Alloc(&writer, size);
6783 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006784 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006785
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006786 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006787 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006788
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006790 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006792 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006796 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006799 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006801
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006802 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006804
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006805 /* Only overallocate the buffer if it's not the last write */
6806 writer.overallocate = (collend < size);
6807
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006809 if (error_handler == _Py_ERROR_UNKNOWN)
6810 error_handler = get_error_handler(errors);
6811
6812 switch (error_handler) {
6813 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006814 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006816
6817 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006818 memset(str, '?', collend - collstart);
6819 str += (collend - collstart);
Victor Stinnerc0e77362017-09-12 16:09:44 -07006820 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006821 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006822 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 break;
Victor Stinner50149202015-09-22 00:26:54 +02006824
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006825 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006826 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006827 writer.min_size -= (collend - collstart);
6828 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006829 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006830 if (str == NULL)
6831 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006832 pos = collend;
6833 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006834
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006835 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006836 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006837 writer.min_size -= (collend - collstart);
6838 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006839 unicode, collstart, collend);
6840 if (str == NULL)
6841 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006842 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 break;
Victor Stinner50149202015-09-22 00:26:54 +02006844
Victor Stinnerc3713e92015-09-29 12:32:13 +02006845 case _Py_ERROR_SURROGATEESCAPE:
6846 for (i = collstart; i < collend; ++i) {
6847 ch = PyUnicode_READ(kind, data, i);
6848 if (ch < 0xdc80 || 0xdcff < ch) {
6849 /* Not a UTF-8b surrogate */
6850 break;
6851 }
6852 *str++ = (char)(ch - 0xdc00);
6853 ++pos;
6854 }
6855 if (i >= collend)
6856 break;
6857 collstart = pos;
6858 assert(collstart != collend);
Victor Stinnerc0e77362017-09-12 16:09:44 -07006859 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006860
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6863 encoding, reason, unicode, &exc,
6864 collstart, collend, &newpos);
6865 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006867
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006868 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006869 writer.min_size -= 1;
6870
Victor Stinner6bd525b2015-10-09 13:10:05 +02006871 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006872 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006873 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006874 PyBytes_AS_STRING(rep),
6875 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006876 if (str == NULL)
6877 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006878 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006879 else {
6880 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006881
Victor Stinner6bd525b2015-10-09 13:10:05 +02006882 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006884
6885 if (PyUnicode_IS_ASCII(rep)) {
6886 /* Fast path: all characters are smaller than limit */
6887 assert(limit >= 128);
6888 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6889 str = _PyBytesWriter_WriteBytes(&writer, str,
6890 PyUnicode_DATA(rep),
6891 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006893 else {
6894 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6895
6896 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6897 if (str == NULL)
6898 goto onError;
6899
6900 /* check if there is anything unencodable in the
6901 replacement and copy it to the output */
6902 for (i = 0; repsize-->0; ++i, ++str) {
6903 ch = PyUnicode_READ_CHAR(rep, i);
6904 if (ch >= limit) {
6905 raise_encode_exception(&exc, encoding, unicode,
6906 pos, pos+1, reason);
6907 goto onError;
6908 }
6909 *str = (char)ch;
6910 }
6911 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006914 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006915 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006916
6917 /* If overallocation was disabled, ensure that it was the last
6918 write. Otherwise, we missed an optimization */
6919 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920 }
6921 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006922
Victor Stinner50149202015-09-22 00:26:54 +02006923 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006925 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006926
6927 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006928 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006929 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006930 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006931 Py_XDECREF(exc);
6932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933}
6934
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006935/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006936PyObject *
6937PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006938 Py_ssize_t size,
6939 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006941 PyObject *result;
6942 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6943 if (unicode == NULL)
6944 return NULL;
6945 result = unicode_encode_ucs1(unicode, errors, 256);
6946 Py_DECREF(unicode);
6947 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948}
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006951_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952{
6953 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 PyErr_BadArgument();
6955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006957 if (PyUnicode_READY(unicode) == -1)
6958 return NULL;
6959 /* Fast path: if it is a one-byte string, construct
6960 bytes object directly. */
6961 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6962 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6963 PyUnicode_GET_LENGTH(unicode));
6964 /* Non-Latin-1 characters present. Defer to above function to
6965 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006966 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006967}
6968
6969PyObject*
6970PyUnicode_AsLatin1String(PyObject *unicode)
6971{
6972 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973}
6974
6975/* --- 7-bit ASCII Codec -------------------------------------------------- */
6976
Alexander Belopolsky40018472011-02-26 01:02:56 +00006977PyObject *
6978PyUnicode_DecodeASCII(const char *s,
6979 Py_ssize_t size,
6980 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006984 int kind;
6985 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006986 Py_ssize_t startinpos;
6987 Py_ssize_t endinpos;
6988 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006990 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006992 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006996
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006998 if (size == 1 && (unsigned char)s[0] < 128)
6999 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007000
Victor Stinner8f674cc2013-04-17 23:02:17 +02007001 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007002 writer.min_length = size;
7003 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02007004 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007008 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 writer.pos = outpos;
7010 if (writer.pos == size)
7011 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007012
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 s += writer.pos;
7014 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007016 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 PyUnicode_WRITE(kind, data, writer.pos, c);
7019 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007021 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023
7024 /* byte outsize range 0x00..0x7f: call the error handler */
7025
7026 if (error_handler == _Py_ERROR_UNKNOWN)
7027 error_handler = get_error_handler(errors);
7028
7029 switch (error_handler)
7030 {
7031 case _Py_ERROR_REPLACE:
7032 case _Py_ERROR_SURROGATEESCAPE:
7033 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007034 but we may switch to UCS2 at the first write */
7035 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7036 goto onError;
7037 kind = writer.kind;
7038 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007039
7040 if (error_handler == _Py_ERROR_REPLACE)
7041 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7042 else
7043 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7044 writer.pos++;
7045 ++s;
7046 break;
7047
7048 case _Py_ERROR_IGNORE:
7049 ++s;
7050 break;
7051
7052 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 startinpos = s-starts;
7054 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007055 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007056 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 "ascii", "ordinal not in range(128)",
7058 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007059 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007061 kind = writer.kind;
7062 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007065 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007067 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007068
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007070 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007071 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 return NULL;
7074}
7075
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007076/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007077PyObject *
7078PyUnicode_EncodeASCII(const Py_UNICODE *p,
7079 Py_ssize_t size,
7080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082 PyObject *result;
7083 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7084 if (unicode == NULL)
7085 return NULL;
7086 result = unicode_encode_ucs1(unicode, errors, 128);
7087 Py_DECREF(unicode);
7088 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089}
7090
Alexander Belopolsky40018472011-02-26 01:02:56 +00007091PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007092_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093{
7094 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 PyErr_BadArgument();
7096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098 if (PyUnicode_READY(unicode) == -1)
7099 return NULL;
7100 /* Fast path: if it is an ASCII-only string, construct bytes object
7101 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007102 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7104 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007105 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007106}
7107
7108PyObject *
7109PyUnicode_AsASCIIString(PyObject *unicode)
7110{
7111 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112}
7113
Steve Dowercc16be82016-09-08 10:35:16 -07007114#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007115
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007116/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007117
/* NEED_RETRY is defined when size_t is wider than int.  It is tested by
   decode_code_page_stateful(), which then feeds inputs larger than INT_MAX
   to the int-sized decoding helpers in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Provide the flag value ourselves if the platform headers did not. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7125
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007126static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007127code_page_name(UINT code_page, PyObject **obj)
7128{
7129 *obj = NULL;
7130 if (code_page == CP_ACP)
7131 return "mbcs";
7132 if (code_page == CP_UTF7)
7133 return "CP_UTF7";
7134 if (code_page == CP_UTF8)
7135 return "CP_UTF8";
7136
7137 *obj = PyBytes_FromFormat("cp%u", code_page);
7138 if (*obj == NULL)
7139 return NULL;
7140 return PyBytes_AS_STRING(*obj);
7141}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142
Victor Stinner3a50e702011-10-18 21:21:00 +02007143static DWORD
7144decode_code_page_flags(UINT code_page)
7145{
7146 if (code_page == CP_UTF7) {
7147 /* The CP_UTF7 decoder only supports flags=0 */
7148 return 0;
7149 }
7150 else
7151 return MB_ERR_INVALID_CHARS;
7152}
7153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 * Decode a byte string from a Windows code page into unicode object in strict
7156 * mode.
7157 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007158 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7159 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007161static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007162decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007163 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 const char *in,
7165 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166{
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007168 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007170
7171 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 assert(insize > 0);
7173 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7174 if (outsize <= 0)
7175 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176
7177 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007179 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007180 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 if (*v == NULL)
7182 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184 }
7185 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007188 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191 }
7192
7193 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7195 if (outsize <= 0)
7196 goto error;
7197 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199error:
7200 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7201 return -2;
7202 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007203 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204}
7205
Victor Stinner3a50e702011-10-18 21:21:00 +02007206/*
7207 * Decode a byte string from a code page into unicode object with an error
7208 * handler.
7209 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007210 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 * UnicodeDecodeError exception and returns -1 on error.
7212 */
7213static int
7214decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 PyObject **v,
7216 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007217 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007218{
7219 const char *startin = in;
7220 const char *endin = in + size;
7221 const DWORD flags = decode_code_page_flags(code_page);
7222 /* Ideally, we should get reason from FormatMessage. This is the Windows
7223 2000 English version of the message. */
7224 const char *reason = "No mapping for the Unicode character exists "
7225 "in the target code page.";
7226 /* each step cannot decode more than 1 character, but a character can be
7227 represented as a surrogate pair */
7228 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007229 int insize;
7230 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 PyObject *errorHandler = NULL;
7232 PyObject *exc = NULL;
7233 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007234 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 DWORD err;
7236 int ret = -1;
7237
7238 assert(size > 0);
7239
7240 encoding = code_page_name(code_page, &encoding_obj);
7241 if (encoding == NULL)
7242 return -1;
7243
Victor Stinner7d00cc12014-03-17 23:08:06 +01007244 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7246 UnicodeDecodeError. */
7247 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7248 if (exc != NULL) {
7249 PyCodec_StrictErrors(exc);
7250 Py_CLEAR(exc);
7251 }
7252 goto error;
7253 }
7254
7255 if (*v == NULL) {
7256 /* Create unicode object */
7257 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7258 PyErr_NoMemory();
7259 goto error;
7260 }
Victor Stinnerab595942011-12-17 04:59:06 +01007261 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007262 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 if (*v == NULL)
7264 goto error;
7265 startout = PyUnicode_AS_UNICODE(*v);
7266 }
7267 else {
7268 /* Extend unicode object */
7269 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7270 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7271 PyErr_NoMemory();
7272 goto error;
7273 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007274 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 goto error;
7276 startout = PyUnicode_AS_UNICODE(*v) + n;
7277 }
7278
7279 /* Decode the byte string character per character */
7280 out = startout;
7281 while (in < endin)
7282 {
7283 /* Decode a character */
7284 insize = 1;
7285 do
7286 {
7287 outsize = MultiByteToWideChar(code_page, flags,
7288 in, insize,
7289 buffer, Py_ARRAY_LENGTH(buffer));
7290 if (outsize > 0)
7291 break;
7292 err = GetLastError();
7293 if (err != ERROR_NO_UNICODE_TRANSLATION
7294 && err != ERROR_INSUFFICIENT_BUFFER)
7295 {
7296 PyErr_SetFromWindowsErr(0);
7297 goto error;
7298 }
7299 insize++;
7300 }
7301 /* 4=maximum length of a UTF-8 sequence */
7302 while (insize <= 4 && (in + insize) <= endin);
7303
7304 if (outsize <= 0) {
7305 Py_ssize_t startinpos, endinpos, outpos;
7306
Victor Stinner7d00cc12014-03-17 23:08:06 +01007307 /* last character in partial decode? */
7308 if (in + insize >= endin && !final)
7309 break;
7310
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 startinpos = in - startin;
7312 endinpos = startinpos + 1;
7313 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007314 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 errors, &errorHandler,
7316 encoding, reason,
7317 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007318 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 {
7320 goto error;
7321 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007322 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 }
7324 else {
7325 in += insize;
7326 memcpy(out, buffer, outsize * sizeof(wchar_t));
7327 out += outsize;
7328 }
7329 }
7330
7331 /* write a NUL character at the end */
7332 *out = 0;
7333
7334 /* Extend unicode object */
7335 outsize = out - startout;
7336 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007337 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007339 /* (in - startin) <= size and size is an int */
7340 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007341
7342error:
7343 Py_XDECREF(encoding_obj);
7344 Py_XDECREF(errorHandler);
7345 Py_XDECREF(exc);
7346 return ret;
7347}
7348
Victor Stinner3a50e702011-10-18 21:21:00 +02007349static PyObject *
7350decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007351 const char *s, Py_ssize_t size,
7352 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353{
Victor Stinner76a31a62011-11-04 00:05:13 +01007354 PyObject *v = NULL;
7355 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 if (code_page < 0) {
7358 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7359 return NULL;
7360 }
Serhiy Storchaka82a90752017-07-11 07:27:56 +03007361 if (size < 0) {
7362 PyErr_BadInternalCall();
7363 return NULL;
7364 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007365
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368
Victor Stinner76a31a62011-11-04 00:05:13 +01007369 do
7370 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 if (size > INT_MAX) {
7373 chunk_size = INT_MAX;
7374 final = 0;
7375 done = 0;
7376 }
7377 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007379 {
7380 chunk_size = (int)size;
7381 final = (consumed == NULL);
7382 done = 1;
7383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007384
Victor Stinner76a31a62011-11-04 00:05:13 +01007385 if (chunk_size == 0 && done) {
7386 if (v != NULL)
7387 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007388 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390
Victor Stinner76a31a62011-11-04 00:05:13 +01007391 converted = decode_code_page_strict(code_page, &v,
7392 s, chunk_size);
7393 if (converted == -2)
7394 converted = decode_code_page_errors(code_page, &v,
7395 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007396 errors, final);
7397 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007398
7399 if (converted < 0) {
7400 Py_XDECREF(v);
7401 return NULL;
7402 }
7403
7404 if (consumed)
7405 *consumed += converted;
7406
7407 s += converted;
7408 size -= converted;
7409 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007410
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007411 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412}
7413
Alexander Belopolsky40018472011-02-26 01:02:56 +00007414PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007415PyUnicode_DecodeCodePageStateful(int code_page,
7416 const char *s,
7417 Py_ssize_t size,
7418 const char *errors,
7419 Py_ssize_t *consumed)
7420{
7421 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7422}
7423
7424PyObject *
7425PyUnicode_DecodeMBCSStateful(const char *s,
7426 Py_ssize_t size,
7427 const char *errors,
7428 Py_ssize_t *consumed)
7429{
7430 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7431}
7432
7433PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007434PyUnicode_DecodeMBCS(const char *s,
7435 Py_ssize_t size,
7436 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007437{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7439}
7440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441static DWORD
7442encode_code_page_flags(UINT code_page, const char *errors)
7443{
7444 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007445 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 }
7447 else if (code_page == CP_UTF7) {
7448 /* CP_UTF7 only supports flags=0 */
7449 return 0;
7450 }
7451 else {
7452 if (errors != NULL && strcmp(errors, "replace") == 0)
7453 return 0;
7454 else
7455 return WC_NO_BEST_FIT_CHARS;
7456 }
7457}
7458
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007459/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 * Encode a Unicode string to a Windows code page into a byte string in strict
7461 * mode.
7462 *
7463 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007464 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007466static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007467encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470{
Victor Stinner554f3f02010-06-16 23:33:54 +00007471 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 BOOL *pusedDefaultChar = &usedDefaultChar;
7473 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007474 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 const DWORD flags = encode_code_page_flags(code_page, NULL);
7477 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007478 /* Create a substring so that we can get the UTF-16 representation
7479 of just the slice under consideration. */
7480 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007483
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007485 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007487 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007488
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 substring = PyUnicode_Substring(unicode, offset, offset+len);
7490 if (substring == NULL)
7491 return -1;
7492 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7493 if (p == NULL) {
7494 Py_DECREF(substring);
7495 return -1;
7496 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007497 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007499 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007501 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 NULL, 0,
7503 NULL, pusedDefaultChar);
7504 if (outsize <= 0)
7505 goto error;
7506 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007507 if (pusedDefaultChar && *pusedDefaultChar) {
7508 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007510 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007511
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 if (*outbytes == NULL) {
7516 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007518 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520 }
7521 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 const Py_ssize_t n = PyBytes_Size(*outbytes);
7524 if (outsize > PY_SSIZE_T_MAX - n) {
7525 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007526 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007529 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7530 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007532 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534 }
7535
7536 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007538 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 out, outsize,
7540 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 if (outsize <= 0)
7543 goto error;
7544 if (pusedDefaultChar && *pusedDefaultChar)
7545 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007547
Victor Stinner3a50e702011-10-18 21:21:00 +02007548error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007549 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7551 return -2;
7552 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007553 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007554}
7555
Victor Stinner3a50e702011-10-18 21:21:00 +02007556/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007557 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 * error handler.
7559 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007560 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 * -1 on other error.
7562 */
7563static int
7564encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007565 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007567{
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007569 Py_ssize_t pos = unicode_offset;
7570 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 /* Ideally, we should get reason from FormatMessage. This is the Windows
7572 2000 English version of the message. */
7573 const char *reason = "invalid character";
7574 /* 4=maximum length of a UTF-8 sequence */
7575 char buffer[4];
7576 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7577 Py_ssize_t outsize;
7578 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 PyObject *errorHandler = NULL;
7580 PyObject *exc = NULL;
7581 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007582 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007583 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 PyObject *rep;
7585 int ret = -1;
7586
7587 assert(insize > 0);
7588
7589 encoding = code_page_name(code_page, &encoding_obj);
7590 if (encoding == NULL)
7591 return -1;
7592
7593 if (errors == NULL || strcmp(errors, "strict") == 0) {
7594 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7595 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007596 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007597 if (exc != NULL) {
7598 PyCodec_StrictErrors(exc);
7599 Py_DECREF(exc);
7600 }
7601 Py_XDECREF(encoding_obj);
7602 return -1;
7603 }
7604
7605 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7606 pusedDefaultChar = &usedDefaultChar;
7607 else
7608 pusedDefaultChar = NULL;
7609
7610 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7611 PyErr_NoMemory();
7612 goto error;
7613 }
7614 outsize = insize * Py_ARRAY_LENGTH(buffer);
7615
7616 if (*outbytes == NULL) {
7617 /* Create string object */
7618 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7619 if (*outbytes == NULL)
7620 goto error;
7621 out = PyBytes_AS_STRING(*outbytes);
7622 }
7623 else {
7624 /* Extend string object */
7625 Py_ssize_t n = PyBytes_Size(*outbytes);
7626 if (n > PY_SSIZE_T_MAX - outsize) {
7627 PyErr_NoMemory();
7628 goto error;
7629 }
7630 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7631 goto error;
7632 out = PyBytes_AS_STRING(*outbytes) + n;
7633 }
7634
7635 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007636 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007638 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7639 wchar_t chars[2];
7640 int charsize;
7641 if (ch < 0x10000) {
7642 chars[0] = (wchar_t)ch;
7643 charsize = 1;
7644 }
7645 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007646 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7647 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007648 charsize = 2;
7649 }
7650
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 buffer, Py_ARRAY_LENGTH(buffer),
7654 NULL, pusedDefaultChar);
7655 if (outsize > 0) {
7656 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7657 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007658 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007659 memcpy(out, buffer, outsize);
7660 out += outsize;
7661 continue;
7662 }
7663 }
7664 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7665 PyErr_SetFromWindowsErr(0);
7666 goto error;
7667 }
7668
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 rep = unicode_encode_call_errorhandler(
7670 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007671 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007672 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 if (rep == NULL)
7674 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007675 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007676
7677 if (PyBytes_Check(rep)) {
7678 outsize = PyBytes_GET_SIZE(rep);
7679 if (outsize != 1) {
7680 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7681 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7682 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7683 Py_DECREF(rep);
7684 goto error;
7685 }
7686 out = PyBytes_AS_STRING(*outbytes) + offset;
7687 }
7688 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7689 out += outsize;
7690 }
7691 else {
7692 Py_ssize_t i;
7693 enum PyUnicode_Kind kind;
7694 void *data;
7695
Benjamin Petersonbac79492012-01-14 13:34:47 -05007696 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 Py_DECREF(rep);
7698 goto error;
7699 }
7700
7701 outsize = PyUnicode_GET_LENGTH(rep);
7702 if (outsize != 1) {
7703 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7704 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7705 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7706 Py_DECREF(rep);
7707 goto error;
7708 }
7709 out = PyBytes_AS_STRING(*outbytes) + offset;
7710 }
7711 kind = PyUnicode_KIND(rep);
7712 data = PyUnicode_DATA(rep);
7713 for (i=0; i < outsize; i++) {
7714 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7715 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007716 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007717 encoding, unicode,
7718 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007719 "unable to encode error handler result to ASCII");
7720 Py_DECREF(rep);
7721 goto error;
7722 }
7723 *out = (unsigned char)ch;
7724 out++;
7725 }
7726 }
7727 Py_DECREF(rep);
7728 }
7729 /* write a NUL byte */
7730 *out = 0;
7731 outsize = out - PyBytes_AS_STRING(*outbytes);
7732 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7733 if (_PyBytes_Resize(outbytes, outsize) < 0)
7734 goto error;
7735 ret = 0;
7736
7737error:
7738 Py_XDECREF(encoding_obj);
7739 Py_XDECREF(errorHandler);
7740 Py_XDECREF(exc);
7741 return ret;
7742}
7743
Victor Stinner3a50e702011-10-18 21:21:00 +02007744static PyObject *
7745encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007746 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007747 const char *errors)
7748{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007749 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007750 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007751 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007753
Victor Stinner29dacf22015-01-26 16:41:32 +01007754 if (!PyUnicode_Check(unicode)) {
7755 PyErr_BadArgument();
7756 return NULL;
7757 }
7758
Benjamin Petersonbac79492012-01-14 13:34:47 -05007759 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007760 return NULL;
7761 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007762
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 if (code_page < 0) {
7764 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7765 return NULL;
7766 }
7767
Martin v. Löwis3d325192011-11-04 18:23:06 +01007768 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007769 return PyBytes_FromStringAndSize(NULL, 0);
7770
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 offset = 0;
7772 do
7773 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007774#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007775 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007776 chunks. */
7777 if (len > INT_MAX/2) {
7778 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007779 done = 0;
7780 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007781 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007783 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007785 done = 1;
7786 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007787
Victor Stinner76a31a62011-11-04 00:05:13 +01007788 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007789 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007790 errors);
7791 if (ret == -2)
7792 ret = encode_code_page_errors(code_page, &outbytes,
7793 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007794 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007795 if (ret < 0) {
7796 Py_XDECREF(outbytes);
7797 return NULL;
7798 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007799
Victor Stinner7581cef2011-11-03 22:32:33 +01007800 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007801 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007802 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007803
Victor Stinner3a50e702011-10-18 21:21:00 +02007804 return outbytes;
7805}
7806
7807PyObject *
7808PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7809 Py_ssize_t size,
7810 const char *errors)
7811{
Victor Stinner7581cef2011-11-03 22:32:33 +01007812 PyObject *unicode, *res;
7813 unicode = PyUnicode_FromUnicode(p, size);
7814 if (unicode == NULL)
7815 return NULL;
7816 res = encode_code_page(CP_ACP, unicode, errors);
7817 Py_DECREF(unicode);
7818 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007819}
7820
/* Public entry point: encode `unicode` to the Windows code page `code_page`
   using the `errors` handler.  Thin wrapper that forwards all arguments to
   encode_code_page() and returns its result unchanged. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007828
/* Encode `unicode` with the ANSI code page (CP_ACP), passing NULL as the
   error handler name. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
}
7834
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007835#undef NEED_RETRY
7836
Steve Dowercc16be82016-09-08 10:35:16 -07007837#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007838
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839/* --- Character Mapping Codec -------------------------------------------- */
7840
/* Decode the `size` bytes at `s` through `mapping`, a str object used as a
   lookup table: byte value b maps to the character mapping[b].  Bytes that
   are >= len(mapping) or that map to U+FFFE are treated as undefined and
   passed to the `errors` handler.  Decoded characters are appended to
   `writer`.

   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* the character does not fit the writer's current maxchar:
                   prepare the writer for maxchar 0xff, then reload the
                   cached maxchar and data pointer */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* no error handler is ever called on this path, so errorHandler
           and exc are still NULL here */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* fast paths for a 2-byte table covering all 256 byte values;
               they write directly into the writer buffer and jump to the
               Error label below for any character they cannot store */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        /* either undefined (0xFFFE) or too wide for the
                           current buffer: handled at Error */
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            /* any other writer kind falls through to the generic path */
        }
        /* generic path: one byte per iteration */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* also reached by goto from the fast paths above, with s pointing
           at the offending byte and x holding its mapped value */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the handler call above and is not
               advanced here */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7959
7960static int
7961charmap_decode_mapping(const char *s,
7962 Py_ssize_t size,
7963 PyObject *mapping,
7964 const char *errors,
7965 _PyUnicodeWriter *writer)
7966{
7967 const char *starts = s;
7968 const char *e;
7969 Py_ssize_t startinpos, endinpos;
7970 PyObject *errorHandler = NULL, *exc = NULL;
7971 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007972 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007973
7974 e = s + size;
7975
7976 while (s < e) {
7977 ch = *s;
7978
7979 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7980 key = PyLong_FromLong((long)ch);
7981 if (key == NULL)
7982 goto onError;
7983
7984 item = PyObject_GetItem(mapping, key);
7985 Py_DECREF(key);
7986 if (item == NULL) {
7987 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7988 /* No mapping found means: mapping is undefined. */
7989 PyErr_Clear();
7990 goto Undefined;
7991 } else
7992 goto onError;
7993 }
7994
7995 /* Apply mapping */
7996 if (item == Py_None)
7997 goto Undefined;
7998 if (PyLong_Check(item)) {
7999 long value = PyLong_AS_LONG(item);
8000 if (value == 0xFFFE)
8001 goto Undefined;
8002 if (value < 0 || value > MAX_UNICODE) {
8003 PyErr_Format(PyExc_TypeError,
8004 "character mapping must be in range(0x%lx)",
8005 (unsigned long)MAX_UNICODE + 1);
8006 goto onError;
8007 }
8008
8009 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8010 goto onError;
8011 }
8012 else if (PyUnicode_Check(item)) {
8013 if (PyUnicode_READY(item) == -1)
8014 goto onError;
8015 if (PyUnicode_GET_LENGTH(item) == 1) {
8016 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8017 if (value == 0xFFFE)
8018 goto Undefined;
8019 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8020 goto onError;
8021 }
8022 else {
8023 writer->overallocate = 1;
8024 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8025 goto onError;
8026 }
8027 }
8028 else {
8029 /* wrong return value */
8030 PyErr_SetString(PyExc_TypeError,
8031 "character mapping must return integer, None or str");
8032 goto onError;
8033 }
8034 Py_CLEAR(item);
8035 ++s;
8036 continue;
8037
8038Undefined:
8039 /* undefined mapping */
8040 Py_CLEAR(item);
8041 startinpos = s-starts;
8042 endinpos = startinpos+1;
8043 if (unicode_decode_call_errorhandler_writer(
8044 errors, &errorHandler,
8045 "charmap", "character maps to <undefined>",
8046 &starts, &e, &startinpos, &endinpos, &exc, &s,
8047 writer)) {
8048 goto onError;
8049 }
8050 }
8051 Py_XDECREF(errorHandler);
8052 Py_XDECREF(exc);
8053 return 0;
8054
8055onError:
8056 Py_XDECREF(item);
8057 Py_XDECREF(errorHandler);
8058 Py_XDECREF(exc);
8059 return -1;
8060}
8061
Alexander Belopolsky40018472011-02-26 01:02:56 +00008062PyObject *
8063PyUnicode_DecodeCharmap(const char *s,
8064 Py_ssize_t size,
8065 PyObject *mapping,
8066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008068 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008069
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 /* Default to Latin-1 */
8071 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008075 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008076 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008077 writer.min_length = size;
8078 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008080
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008081 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008082 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8083 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008084 }
8085 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008086 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8087 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008089 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008090
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008092 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 return NULL;
8094}
8095
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096/* Charmap encoding: the lookup table */
8097
/* Three-level trie mapping a BMP code point to a byte value.  Built by
   PyUnicode_BuildEncodingMap() and queried by encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* level 1: indexed by ch >> 11; holds the number of a level-2 block,
       or 0xFF if unused */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and 128-byte level-3 blocks that
       are stored in level23 */
    int count2, count3;
    /* start of the trailing data: 16*count2 bytes of level-2 blocks
       followed by 128*count3 bytes of level-3 blocks; the object is
       allocated larger than sizeof(struct encoding_map) to hold them */
    unsigned char level23[1];
};
8104
8105static PyObject*
8106encoding_map_size(PyObject *obj, PyObject* args)
8107{
8108 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111}
8112
/* Method table of the EncodingMap type: a single no-argument method
   "size", terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
8118
/* tp_dealloc of EncodingMap: the object is a single PyObject_MALLOC()
   allocation (see PyUnicode_BuildEncodingMap), so freeing it is enough. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8124
/* Type object for struct encoding_map.  Only tp_dealloc and tp_methods are
   filled in; tp_new is 0, and instances are created directly by
   PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8168
8169PyObject*
8170PyUnicode_BuildEncodingMap(PyObject* string)
8171{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 PyObject *result;
8173 struct encoding_map *mresult;
8174 int i;
8175 int need_dict = 0;
8176 unsigned char level1[32];
8177 unsigned char level2[512];
8178 unsigned char *mlevel1, *mlevel2, *mlevel3;
8179 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 int kind;
8181 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008182 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008185 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186 PyErr_BadArgument();
8187 return NULL;
8188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 kind = PyUnicode_KIND(string);
8190 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008191 length = PyUnicode_GET_LENGTH(string);
8192 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 memset(level1, 0xFF, sizeof level1);
8194 memset(level2, 0xFF, sizeof level2);
8195
8196 /* If there isn't a one-to-one mapping of NULL to \0,
8197 or if there are non-BMP characters, we need to use
8198 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008201 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 ch = PyUnicode_READ(kind, data, i);
8204 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205 need_dict = 1;
8206 break;
8207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209 /* unmapped character */
8210 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 l1 = ch >> 11;
8212 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 if (level1[l1] == 0xFF)
8214 level1[l1] = count2++;
8215 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008216 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 }
8218
8219 if (count2 >= 0xFF || count3 >= 0xFF)
8220 need_dict = 1;
8221
8222 if (need_dict) {
8223 PyObject *result = PyDict_New();
8224 PyObject *key, *value;
8225 if (!result)
8226 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008227 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008229 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230 if (!key || !value)
8231 goto failed1;
8232 if (PyDict_SetItem(result, key, value) == -1)
8233 goto failed1;
8234 Py_DECREF(key);
8235 Py_DECREF(value);
8236 }
8237 return result;
8238 failed1:
8239 Py_XDECREF(key);
8240 Py_XDECREF(value);
8241 Py_DECREF(result);
8242 return NULL;
8243 }
8244
8245 /* Create a three-level trie */
8246 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8247 16*count2 + 128*count3 - 1);
8248 if (!result)
8249 return PyErr_NoMemory();
8250 PyObject_Init(result, &EncodingMapType);
8251 mresult = (struct encoding_map*)result;
8252 mresult->count2 = count2;
8253 mresult->count3 = count3;
8254 mlevel1 = mresult->level1;
8255 mlevel2 = mresult->level23;
8256 mlevel3 = mresult->level23 + 16*count2;
8257 memcpy(mlevel1, level1, 32);
8258 memset(mlevel2, 0xFF, 16*count2);
8259 memset(mlevel3, 0, 128*count3);
8260 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008261 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008263 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8264 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265 /* unmapped character */
8266 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008267 o1 = ch>>11;
8268 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269 i2 = 16*mlevel1[o1] + o2;
8270 if (mlevel2[i2] == 0xFF)
8271 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008272 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 i3 = 128*mlevel2[i2] + o3;
8274 mlevel3[i3] = i;
8275 }
8276 return result;
8277}
8278
8279static int
Victor Stinner22168992011-11-20 17:09:18 +01008280encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281{
8282 struct encoding_map *map = (struct encoding_map*)mapping;
8283 int l1 = c>>11;
8284 int l2 = (c>>7) & 0xF;
8285 int l3 = c & 0x7F;
8286 int i;
8287
Victor Stinner22168992011-11-20 17:09:18 +01008288 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 if (c == 0)
8291 return 0;
8292 /* level 1*/
8293 i = map->level1[l1];
8294 if (i == 0xFF) {
8295 return -1;
8296 }
8297 /* level 2*/
8298 i = map->level23[16*i+l2];
8299 if (i == 0xFF) {
8300 return -1;
8301 }
8302 /* level 3 */
8303 i = map->level23[16*map->count2 + 128*i + l3];
8304 if (i == 0) {
8305 return -1;
8306 }
8307 return i;
8308}
8309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310/* Lookup the character ch in the mapping. If the character
8311 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008312 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008314charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315{
Christian Heimes217cfd12007-12-02 14:31:20 +00008316 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 PyObject *x;
8318
8319 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 x = PyObject_GetItem(mapping, w);
8322 Py_DECREF(w);
8323 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8325 /* No mapping found means: mapping is undefined. */
8326 PyErr_Clear();
8327 x = Py_None;
8328 Py_INCREF(x);
8329 return x;
8330 } else
8331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008333 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008335 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 long value = PyLong_AS_LONG(x);
8337 if (value < 0 || value > 255) {
8338 PyErr_SetString(PyExc_TypeError,
8339 "character mapping must be in range(256)");
8340 Py_DECREF(x);
8341 return NULL;
8342 }
8343 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008345 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 /* wrong return value */
8349 PyErr_Format(PyExc_TypeError,
8350 "character mapping must return integer, bytes or None, not %.400s",
8351 x->ob_type->tp_name);
8352 Py_DECREF(x);
8353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355}
8356
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008358charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008359{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008360 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8361 /* exponentially overallocate to minimize reallocations */
8362 if (requiredsize < 2*outsize)
8363 requiredsize = 2*outsize;
8364 if (_PyBytes_Resize(outobj, requiredsize))
8365 return -1;
8366 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008367}
8368
/* Result of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output,
   enc_FAILED    - the character has no mapping; nothing was written,
   enc_EXCEPTION - an error occurred (lookup or resize failed). */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008373 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 space is available. Return a new reference to the object that
8375 was put in the output buffer, or Py_None, if the mapping was undefined
8376 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008377 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008379charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008380 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008382 PyObject *rep;
8383 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008384 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385
Christian Heimes90aa7642007-12-19 02:45:37 +00008386 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008387 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008389 if (res == -1)
8390 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 if (outsize<requiredsize)
8392 if (charmapencode_resize(outobj, outpos, requiredsize))
8393 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008394 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 outstart[(*outpos)++] = (char)res;
8396 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008397 }
8398
8399 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008402 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 Py_DECREF(rep);
8404 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 if (PyLong_Check(rep)) {
8407 Py_ssize_t requiredsize = *outpos+1;
8408 if (outsize<requiredsize)
8409 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8410 Py_DECREF(rep);
8411 return enc_EXCEPTION;
8412 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008413 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 else {
8417 const char *repchars = PyBytes_AS_STRING(rep);
8418 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8419 Py_ssize_t requiredsize = *outpos+repsize;
8420 if (outsize<requiredsize)
8421 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8422 Py_DECREF(rep);
8423 return enc_EXCEPTION;
8424 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008425 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 memcpy(outstart + *outpos, repchars, repsize);
8427 *outpos += repsize;
8428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 Py_DECREF(rep);
8431 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432}
8433
8434/* handle an error in PyUnicode_EncodeCharmap
8435 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008436static int
8437charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008438 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008440 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008441 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442{
8443 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008444 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008445 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008446 enum PyUnicode_Kind kind;
8447 void *data;
8448 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 Py_ssize_t collstartpos = *inpos;
8451 Py_ssize_t collendpos = *inpos+1;
8452 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 char *encoding = "charmap";
8454 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008456 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008457 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458
Benjamin Petersonbac79492012-01-14 13:34:47 -05008459 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008460 return -1;
8461 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 /* find all unencodable characters */
8463 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008464 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008465 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008466 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008467 val = encoding_map_lookup(ch, mapping);
8468 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 break;
8470 ++collendpos;
8471 continue;
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008474 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8475 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 if (rep==NULL)
8477 return -1;
8478 else if (rep!=Py_None) {
8479 Py_DECREF(rep);
8480 break;
8481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484 }
8485 /* cache callback name lookup
8486 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008487 if (*error_handler == _Py_ERROR_UNKNOWN)
8488 *error_handler = get_error_handler(errors);
8489
8490 switch (*error_handler) {
8491 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008492 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008493 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008494
8495 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008496 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 x = charmapencode_output('?', mapping, res, respos);
8498 if (x==enc_EXCEPTION) {
8499 return -1;
8500 }
8501 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008502 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return -1;
8504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 }
8506 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008507 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 *inpos = collendpos;
8509 break;
Victor Stinner50149202015-09-22 00:26:54 +02008510
8511 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008512 /* generate replacement (temporarily (mis)uses p) */
8513 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 char buffer[2+29+1+1];
8515 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 for (cp = buffer; *cp; ++cp) {
8518 x = charmapencode_output(*cp, mapping, res, respos);
8519 if (x==enc_EXCEPTION)
8520 return -1;
8521 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008522 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
8524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 }
8526 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008527 *inpos = collendpos;
8528 break;
Victor Stinner50149202015-09-22 00:26:54 +02008529
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 default:
Victor Stinner50149202015-09-22 00:26:54 +02008531 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008536 if (PyBytes_Check(repunicode)) {
8537 /* Directly copy bytes result to output. */
8538 Py_ssize_t outsize = PyBytes_Size(*res);
8539 Py_ssize_t requiredsize;
8540 repsize = PyBytes_Size(repunicode);
8541 requiredsize = *respos + repsize;
8542 if (requiredsize > outsize)
8543 /* Make room for all additional bytes. */
8544 if (charmapencode_resize(res, respos, requiredsize)) {
8545 Py_DECREF(repunicode);
8546 return -1;
8547 }
8548 memcpy(PyBytes_AsString(*res) + *respos,
8549 PyBytes_AsString(repunicode), repsize);
8550 *respos += repsize;
8551 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008552 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008553 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008554 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008555 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008556 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008557 Py_DECREF(repunicode);
8558 return -1;
8559 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008560 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008561 data = PyUnicode_DATA(repunicode);
8562 kind = PyUnicode_KIND(repunicode);
8563 for (index = 0; index < repsize; index++) {
8564 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8565 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008567 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 return -1;
8569 }
8570 else if (x==enc_FAILED) {
8571 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008572 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 return -1;
8574 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008575 }
8576 *inpos = newpos;
8577 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 }
8579 return 0;
8580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583_PyUnicode_EncodeCharmap(PyObject *unicode,
8584 PyObject *mapping,
8585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* output object */
8588 PyObject *res = NULL;
8589 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008590 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008591 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008593 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008594 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008596 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008597 void *data;
8598 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
Benjamin Petersonbac79492012-01-14 13:34:47 -05008600 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 return NULL;
8602 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008603 data = PyUnicode_DATA(unicode);
8604 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008605
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 /* Default to Latin-1 */
8607 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610 /* allocate enough for a simple encoding without
8611 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008612 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 if (res == NULL)
8614 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008615 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008619 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008621 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 if (x==enc_EXCEPTION) /* error */
8623 goto onError;
8624 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008625 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008627 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 &res, &respos)) {
8629 goto onError;
8630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 else
8633 /* done with this character => adjust input position */
8634 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008638 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008639 if (_PyBytes_Resize(&res, respos) < 0)
8640 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008643 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 return res;
8645
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 Py_XDECREF(res);
8648 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008649 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 return NULL;
8651}
8652
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008653/* Deprecated */
8654PyObject *
8655PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8656 Py_ssize_t size,
8657 PyObject *mapping,
8658 const char *errors)
8659{
8660 PyObject *result;
8661 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8662 if (unicode == NULL)
8663 return NULL;
8664 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8665 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008666 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008667}
8668
Alexander Belopolsky40018472011-02-26 01:02:56 +00008669PyObject *
8670PyUnicode_AsCharmapString(PyObject *unicode,
8671 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672{
8673 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 PyErr_BadArgument();
8675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008677 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678}
8679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008681static void
8682make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684 Py_ssize_t startpos, Py_ssize_t endpos,
8685 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 *exceptionObject = _PyUnicodeTranslateError_Create(
8689 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 }
8691 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8693 goto onError;
8694 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8695 goto onError;
8696 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8697 goto onError;
8698 return;
8699 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008700 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 }
8702}
8703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704/* error handling callback helper:
8705 build arguments, call the callback and check the arguments,
8706 put the result into newpos and return the replacement string, which
8707 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008708static PyObject *
8709unicode_translate_call_errorhandler(const char *errors,
8710 PyObject **errorHandler,
8711 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008713 Py_ssize_t startpos, Py_ssize_t endpos,
8714 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008716 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008718 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 PyObject *restuple;
8720 PyObject *resunicode;
8721
8722 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726 }
8727
8728 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732
8733 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008738 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(restuple);
8740 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 }
8742 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 &resunicode, &i_newpos)) {
8744 Py_DECREF(restuple);
8745 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008747 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008749 else
8750 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008752 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 Py_DECREF(restuple);
8754 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 Py_INCREF(resunicode);
8757 Py_DECREF(restuple);
8758 return resunicode;
8759}
8760
8761/* Lookup the character ch in the mapping and put the result in result,
8762 which must be decrefed by the caller.
8763 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008764static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766{
Christian Heimes217cfd12007-12-02 14:31:20 +00008767 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768 PyObject *x;
8769
8770 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 x = PyObject_GetItem(mapping, w);
8773 Py_DECREF(w);
8774 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8776 /* No mapping found means: use 1:1 mapping. */
8777 PyErr_Clear();
8778 *result = NULL;
8779 return 0;
8780 } else
8781 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 }
8783 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 *result = x;
8785 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008787 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008789 if (value < 0 || value > MAX_UNICODE) {
8790 PyErr_Format(PyExc_ValueError,
8791 "character mapping must be in range(0x%x)",
8792 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 Py_DECREF(x);
8794 return -1;
8795 }
8796 *result = x;
8797 return 0;
8798 }
8799 else if (PyUnicode_Check(x)) {
8800 *result = x;
8801 return 0;
8802 }
8803 else {
8804 /* wrong return value */
8805 PyErr_SetString(PyExc_TypeError,
8806 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008807 Py_DECREF(x);
8808 return -1;
8809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008810}
Victor Stinner1194ea02014-04-04 19:37:40 +02008811
8812/* lookup the character, write the result into the writer.
8813 Return 1 if the result was written into the writer, return 0 if the mapping
8814 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008815static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008816charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8817 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818{
Victor Stinner1194ea02014-04-04 19:37:40 +02008819 PyObject *item;
8820
8821 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008823
8824 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008826 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008829 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008830 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008831
8832 if (item == Py_None) {
8833 Py_DECREF(item);
8834 return 0;
8835 }
8836
8837 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008838 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8839 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8840 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008841 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8842 Py_DECREF(item);
8843 return -1;
8844 }
8845 Py_DECREF(item);
8846 return 1;
8847 }
8848
8849 if (!PyUnicode_Check(item)) {
8850 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008852 }
8853
8854 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8855 Py_DECREF(item);
8856 return -1;
8857 }
8858
8859 Py_DECREF(item);
8860 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008861}
8862
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863static int
8864unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8865 Py_UCS1 *translate)
8866{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008867 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 int ret = 0;
8869
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870 if (charmaptranslate_lookup(ch, mapping, &item)) {
8871 return -1;
8872 }
8873
8874 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008875 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008876 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008878 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 /* not found => default to 1:1 mapping */
8880 translate[ch] = ch;
8881 return 1;
8882 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008883 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008884 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008885 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8886 used it */
8887 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 /* invalid character or character outside ASCII:
8889 skip the fast translate */
8890 goto exit;
8891 }
8892 translate[ch] = (Py_UCS1)replace;
8893 }
8894 else if (PyUnicode_Check(item)) {
8895 Py_UCS4 replace;
8896
8897 if (PyUnicode_READY(item) == -1) {
8898 Py_DECREF(item);
8899 return -1;
8900 }
8901 if (PyUnicode_GET_LENGTH(item) != 1)
8902 goto exit;
8903
8904 replace = PyUnicode_READ_CHAR(item, 0);
8905 if (replace > 127)
8906 goto exit;
8907 translate[ch] = (Py_UCS1)replace;
8908 }
8909 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008910 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911 goto exit;
8912 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913 ret = 1;
8914
Benjamin Peterson1365de72014-04-07 20:15:41 -04008915 exit:
8916 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 return ret;
8918}
8919
8920/* Fast path for ascii => ascii translation. Return 1 if the whole string
8921 was translated into writer, return 0 if the input string was partially
8922 translated into writer, raise an exception and return -1 on error. */
8923static int
8924unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008925 _PyUnicodeWriter *writer, int ignore,
8926 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008927{
Victor Stinner872b2912014-04-05 14:27:07 +02008928 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 Py_ssize_t len;
8930 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008931 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008932
Victor Stinner89a76ab2014-04-05 11:44:04 +02008933 len = PyUnicode_GET_LENGTH(input);
8934
Victor Stinner872b2912014-04-05 14:27:07 +02008935 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936
8937 in = PyUnicode_1BYTE_DATA(input);
8938 end = in + len;
8939
8940 assert(PyUnicode_IS_ASCII(writer->buffer));
8941 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8942 out = PyUnicode_1BYTE_DATA(writer->buffer);
8943
Victor Stinner872b2912014-04-05 14:27:07 +02008944 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008945 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008946 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008947 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008948 int translate = unicode_fast_translate_lookup(mapping, ch,
8949 ascii_table);
8950 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008951 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008952 if (translate == 0)
8953 goto exit;
8954 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008955 }
Victor Stinner872b2912014-04-05 14:27:07 +02008956 if (ch2 == 0xfe) {
8957 if (ignore)
8958 continue;
8959 goto exit;
8960 }
8961 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008962 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008963 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964 }
Victor Stinner872b2912014-04-05 14:27:07 +02008965 res = 1;
8966
8967exit:
8968 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008969 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008970 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008971}
8972
Victor Stinner3222da22015-10-01 22:07:32 +02008973static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974_PyUnicode_TranslateCharmap(PyObject *input,
8975 PyObject *mapping,
8976 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 Py_ssize_t size, i;
8981 int kind;
8982 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 _PyUnicodeWriter writer;
8984 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 char *reason = "character maps to <undefined>";
8986 PyObject *errorHandler = NULL;
8987 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008989 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008990
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 PyErr_BadArgument();
8993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 if (PyUnicode_READY(input) == -1)
8997 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008998 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 kind = PyUnicode_KIND(input);
9000 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009002 if (size == 0)
9003 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009005 /* allocate enough for a simple 1:1 translation without
9006 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 _PyUnicodeWriter_Init(&writer);
9008 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010
Victor Stinner872b2912014-04-05 14:27:07 +02009011 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9012
Victor Stinner33798672016-03-01 21:59:58 +01009013 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009015 if (PyUnicode_IS_ASCII(input)) {
9016 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9017 if (res < 0) {
9018 _PyUnicodeWriter_Dealloc(&writer);
9019 return NULL;
9020 }
9021 if (res == 1)
9022 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009023 }
Victor Stinner33798672016-03-01 21:59:58 +01009024 else {
9025 i = 0;
9026 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009030 int translate;
9031 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9032 Py_ssize_t newpos;
9033 /* startpos for collecting untranslatable chars */
9034 Py_ssize_t collstart;
9035 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009036 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037
Victor Stinner1194ea02014-04-04 19:37:40 +02009038 ch = PyUnicode_READ(kind, data, i);
9039 translate = charmaptranslate_output(ch, mapping, &writer);
9040 if (translate < 0)
9041 goto onError;
9042
9043 if (translate != 0) {
9044 /* it worked => adjust input pointer */
9045 ++i;
9046 continue;
9047 }
9048
9049 /* untranslatable character */
9050 collstart = i;
9051 collend = i+1;
9052
9053 /* find all untranslatable characters */
9054 while (collend < size) {
9055 PyObject *x;
9056 ch = PyUnicode_READ(kind, data, collend);
9057 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009058 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 Py_XDECREF(x);
9060 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009062 ++collend;
9063 }
9064
9065 if (ignore) {
9066 i = collend;
9067 }
9068 else {
9069 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9070 reason, input, &exc,
9071 collstart, collend, &newpos);
9072 if (repunicode == NULL)
9073 goto onError;
9074 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009076 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009077 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009078 Py_DECREF(repunicode);
9079 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009080 }
9081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009082 Py_XDECREF(exc);
9083 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009084 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009087 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009088 Py_XDECREF(exc);
9089 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 return NULL;
9091}
9092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093/* Deprecated. Use PyUnicode_Translate instead. */
9094PyObject *
9095PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9096 Py_ssize_t size,
9097 PyObject *mapping,
9098 const char *errors)
9099{
Christian Heimes5f520f42012-09-11 14:03:25 +02009100 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9102 if (!unicode)
9103 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009104 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9105 Py_DECREF(unicode);
9106 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107}
9108
Alexander Belopolsky40018472011-02-26 01:02:56 +00009109PyObject *
9110PyUnicode_Translate(PyObject *str,
9111 PyObject *mapping,
9112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009114 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009115 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009116 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117}
Tim Petersced69f82003-09-16 20:30:58 +00009118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009120fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121{
9122 /* No need to call PyUnicode_READY(self) because this function is only
9123 called as a callback from fixup() which does it already. */
9124 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9125 const int kind = PyUnicode_KIND(self);
9126 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009127 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009128 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 Py_ssize_t i;
9130
9131 for (i = 0; i < len; ++i) {
9132 ch = PyUnicode_READ(kind, data, i);
9133 fixed = 0;
9134 if (ch > 127) {
9135 if (Py_UNICODE_ISSPACE(ch))
9136 fixed = ' ';
9137 else {
9138 const int decimal = Py_UNICODE_TODECIMAL(ch);
9139 if (decimal >= 0)
9140 fixed = '0' + decimal;
9141 }
9142 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009143 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009144 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 PyUnicode_WRITE(kind, data, i, fixed);
9146 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009147 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009148 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 }
9151
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009152 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153}
9154
9155PyObject *
9156_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9157{
9158 if (!PyUnicode_Check(unicode)) {
9159 PyErr_BadInternalCall();
9160 return NULL;
9161 }
9162 if (PyUnicode_READY(unicode) == -1)
9163 return NULL;
9164 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9165 /* If the string is already ASCII, just return the same string */
9166 Py_INCREF(unicode);
9167 return unicode;
9168 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009169 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170}
9171
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009172PyObject *
9173PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9174 Py_ssize_t length)
9175{
Victor Stinnerf0124502011-11-21 23:12:56 +01009176 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009177 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009178 Py_UCS4 maxchar;
9179 enum PyUnicode_Kind kind;
9180 void *data;
9181
Victor Stinner99d7ad02012-02-22 13:37:39 +01009182 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009183 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009184 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009185 if (ch > 127) {
9186 int decimal = Py_UNICODE_TODECIMAL(ch);
9187 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009188 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009189 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009190 }
9191 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009192
9193 /* Copy to a new string */
9194 decimal = PyUnicode_New(length, maxchar);
9195 if (decimal == NULL)
9196 return decimal;
9197 kind = PyUnicode_KIND(decimal);
9198 data = PyUnicode_DATA(decimal);
9199 /* Iterate over code points */
9200 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009201 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009202 if (ch > 127) {
9203 int decimal = Py_UNICODE_TODECIMAL(ch);
9204 if (decimal >= 0)
9205 ch = '0' + decimal;
9206 }
9207 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009209 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009210}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009211/* --- Decimal Encoder ---------------------------------------------------- */
9212
Alexander Belopolsky40018472011-02-26 01:02:56 +00009213int
9214PyUnicode_EncodeDecimal(Py_UNICODE *s,
9215 Py_ssize_t length,
9216 char *output,
9217 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009218{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009219 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009220 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009221 enum PyUnicode_Kind kind;
9222 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009223
9224 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 PyErr_BadArgument();
9226 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009227 }
9228
Victor Stinner42bf7752011-11-21 22:52:58 +01009229 unicode = PyUnicode_FromUnicode(s, length);
9230 if (unicode == NULL)
9231 return -1;
9232
Benjamin Petersonbac79492012-01-14 13:34:47 -05009233 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009234 Py_DECREF(unicode);
9235 return -1;
9236 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009237 kind = PyUnicode_KIND(unicode);
9238 data = PyUnicode_DATA(unicode);
9239
Victor Stinnerb84d7232011-11-22 01:50:07 +01009240 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009241 PyObject *exc;
9242 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009244 Py_ssize_t startpos;
9245
9246 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009247
Benjamin Peterson29060642009-01-31 22:14:21 +00009248 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009249 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009250 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009251 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009253 decimal = Py_UNICODE_TODECIMAL(ch);
9254 if (decimal >= 0) {
9255 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009256 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009257 continue;
9258 }
9259 if (0 < ch && ch < 256) {
9260 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009261 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 continue;
9263 }
Victor Stinner6345be92011-11-25 20:09:01 +01009264
Victor Stinner42bf7752011-11-21 22:52:58 +01009265 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009266 exc = NULL;
9267 raise_encode_exception(&exc, "decimal", unicode,
9268 startpos, startpos+1,
9269 "invalid decimal Unicode string");
9270 Py_XDECREF(exc);
9271 Py_DECREF(unicode);
9272 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009273 }
9274 /* 0-terminate the output string */
9275 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009276 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009277 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009278}
9279
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280/* --- Helpers ------------------------------------------------------------ */
9281
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when too large; a negative `end` or `start`
   is taken relative to `len` and clamped to 0 if still negative.
   `start` is not clamped against `len`.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so none may have side effects.  The body is
   wrapped in do { } while (0) so that the macro behaves like a single
   statement (safe inside an unbraced if/else); use it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009298any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009300 Py_ssize_t end,
9301 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009303 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 void *buf1, *buf2;
9305 Py_ssize_t len1, len2, result;
9306
9307 kind1 = PyUnicode_KIND(s1);
9308 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009309 if (kind1 < kind2)
9310 return -1;
9311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 len1 = PyUnicode_GET_LENGTH(s1);
9313 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009314 ADJUST_INDICES(start, end, len1);
9315 if (end - start < len2)
9316 return -1;
9317
9318 buf1 = PyUnicode_DATA(s1);
9319 buf2 = PyUnicode_DATA(s2);
9320 if (len2 == 1) {
9321 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9322 result = findchar((const char *)buf1 + kind1*start,
9323 kind1, end - start, ch, direction);
9324 if (result == -1)
9325 return -1;
9326 else
9327 return start + result;
9328 }
9329
9330 if (kind2 != kind1) {
9331 buf2 = _PyUnicode_AsKind(s2, kind1);
9332 if (!buf2)
9333 return -2;
9334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335
Victor Stinner794d5672011-10-10 03:21:36 +02009336 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009337 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009338 case PyUnicode_1BYTE_KIND:
9339 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9340 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9341 else
9342 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9343 break;
9344 case PyUnicode_2BYTE_KIND:
9345 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9346 break;
9347 case PyUnicode_4BYTE_KIND:
9348 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9349 break;
9350 default:
9351 assert(0); result = -2;
9352 }
9353 }
9354 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009355 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009356 case PyUnicode_1BYTE_KIND:
9357 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9358 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9359 else
9360 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9361 break;
9362 case PyUnicode_2BYTE_KIND:
9363 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9364 break;
9365 case PyUnicode_4BYTE_KIND:
9366 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9367 break;
9368 default:
9369 assert(0); result = -2;
9370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 }
9372
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009373 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 PyMem_Free(buf2);
9375
9376 return result;
9377}
9378
9379Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009380_PyUnicode_InsertThousandsGrouping(
9381 PyObject *unicode, Py_ssize_t index,
9382 Py_ssize_t n_buffer,
9383 void *digits, Py_ssize_t n_digits,
9384 Py_ssize_t min_width,
9385 const char *grouping, PyObject *thousands_sep,
9386 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387{
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009389 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009390 Py_ssize_t thousands_sep_len;
9391 Py_ssize_t len;
9392
9393 if (unicode != NULL) {
9394 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009395 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 }
9397 else {
9398 kind = PyUnicode_1BYTE_KIND;
9399 data = NULL;
9400 }
9401 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9402 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9403 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9404 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009405 if (thousands_sep_kind < kind) {
9406 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9407 if (!thousands_sep_data)
9408 return -1;
9409 }
9410 else {
9411 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9412 if (!data)
9413 return -1;
9414 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009415 }
9416
Benjamin Petersonead6b532011-12-20 17:23:42 -06009417 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009419 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009420 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009421 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009422 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009423 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009424 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009425 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009426 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009427 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009428 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009429 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009431 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009432 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009433 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009434 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009435 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009437 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009438 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009439 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009440 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009441 break;
9442 default:
9443 assert(0);
9444 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009446 if (unicode != NULL && thousands_sep_kind != kind) {
9447 if (thousands_sep_kind < kind)
9448 PyMem_Free(thousands_sep_data);
9449 else
9450 PyMem_Free(data);
9451 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009452 if (unicode == NULL) {
9453 *maxchar = 127;
9454 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009455 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009456 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009457 }
9458 }
9459 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460}
9461
9462
Alexander Belopolsky40018472011-02-26 01:02:56 +00009463Py_ssize_t
9464PyUnicode_Count(PyObject *str,
9465 PyObject *substr,
9466 Py_ssize_t start,
9467 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009469 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009470 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 void *buf1 = NULL, *buf2 = NULL;
9472 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009473
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009474 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009476
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009477 kind1 = PyUnicode_KIND(str);
9478 kind2 = PyUnicode_KIND(substr);
9479 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009480 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009481
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009482 len1 = PyUnicode_GET_LENGTH(str);
9483 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009485 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009486 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 buf1 = PyUnicode_DATA(str);
9489 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009490 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009491 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009492 if (!buf2)
9493 goto onError;
9494 }
9495
9496 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009498 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009499 result = asciilib_count(
9500 ((Py_UCS1*)buf1) + start, end - start,
9501 buf2, len2, PY_SSIZE_T_MAX
9502 );
9503 else
9504 result = ucs1lib_count(
9505 ((Py_UCS1*)buf1) + start, end - start,
9506 buf2, len2, PY_SSIZE_T_MAX
9507 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 break;
9509 case PyUnicode_2BYTE_KIND:
9510 result = ucs2lib_count(
9511 ((Py_UCS2*)buf1) + start, end - start,
9512 buf2, len2, PY_SSIZE_T_MAX
9513 );
9514 break;
9515 case PyUnicode_4BYTE_KIND:
9516 result = ucs4lib_count(
9517 ((Py_UCS4*)buf1) + start, end - start,
9518 buf2, len2, PY_SSIZE_T_MAX
9519 );
9520 break;
9521 default:
9522 assert(0); result = 0;
9523 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009524
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009525 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 PyMem_Free(buf2);
9527
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009530 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 PyMem_Free(buf2);
9532 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533}
9534
Alexander Belopolsky40018472011-02-26 01:02:56 +00009535Py_ssize_t
9536PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009537 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009538 Py_ssize_t start,
9539 Py_ssize_t end,
9540 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009542 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009544
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009545 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546}
9547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548Py_ssize_t
9549PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9550 Py_ssize_t start, Py_ssize_t end,
9551 int direction)
9552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009554 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 if (PyUnicode_READY(str) == -1)
9556 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009557 if (start < 0 || end < 0) {
9558 PyErr_SetString(PyExc_IndexError, "string index out of range");
9559 return -2;
9560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 if (end > PyUnicode_GET_LENGTH(str))
9562 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009563 if (start >= end)
9564 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009566 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9567 kind, end-start, ch, direction);
9568 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009570 else
9571 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572}
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009575tailmatch(PyObject *self,
9576 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577 Py_ssize_t start,
9578 Py_ssize_t end,
9579 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 int kind_self;
9582 int kind_sub;
9583 void *data_self;
9584 void *data_sub;
9585 Py_ssize_t offset;
9586 Py_ssize_t i;
9587 Py_ssize_t end_sub;
9588
9589 if (PyUnicode_READY(self) == -1 ||
9590 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009591 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9594 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009598 if (PyUnicode_GET_LENGTH(substring) == 0)
9599 return 1;
9600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 kind_self = PyUnicode_KIND(self);
9602 data_self = PyUnicode_DATA(self);
9603 kind_sub = PyUnicode_KIND(substring);
9604 data_sub = PyUnicode_DATA(substring);
9605 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9606
9607 if (direction > 0)
9608 offset = end;
9609 else
9610 offset = start;
9611
9612 if (PyUnicode_READ(kind_self, data_self, offset) ==
9613 PyUnicode_READ(kind_sub, data_sub, 0) &&
9614 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9615 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9616 /* If both are of the same kind, memcmp is sufficient */
9617 if (kind_self == kind_sub) {
9618 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009619 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 data_sub,
9621 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009622 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009624 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 else {
9626 /* We do not need to compare 0 and len(substring)-1 because
9627 the if statement above ensured already that they are equal
9628 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 for (i = 1; i < end_sub; ++i) {
9630 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9631 PyUnicode_READ(kind_sub, data_sub, i))
9632 return 0;
9633 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 }
9637
9638 return 0;
9639}
9640
Alexander Belopolsky40018472011-02-26 01:02:56 +00009641Py_ssize_t
9642PyUnicode_Tailmatch(PyObject *str,
9643 PyObject *substr,
9644 Py_ssize_t start,
9645 Py_ssize_t end,
9646 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009651 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654/* Apply fixfct filter to the Unicode object self and return a
9655 reference to the modified object */
9656
Alexander Belopolsky40018472011-02-26 01:02:56 +00009657static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009658fixup(PyObject *self,
9659 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 PyObject *u;
9662 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009663 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009665 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009668 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 /* fix functions return the new maximum character in a string,
9671 if the kind of the resulting unicode object does not change,
9672 everything is fine. Otherwise we need to change the string kind
9673 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009674 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009675
9676 if (maxchar_new == 0) {
9677 /* no changes */;
9678 if (PyUnicode_CheckExact(self)) {
9679 Py_DECREF(u);
9680 Py_INCREF(self);
9681 return self;
9682 }
9683 else
9684 return u;
9685 }
9686
Victor Stinnere6abb482012-05-02 01:15:40 +02009687 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688
Victor Stinnereaab6042011-12-11 22:22:39 +01009689 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009691
9692 /* In case the maximum character changed, we need to
9693 convert the string to the new category. */
9694 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9695 if (v == NULL) {
9696 Py_DECREF(u);
9697 return NULL;
9698 }
9699 if (maxchar_new > maxchar_old) {
9700 /* If the maxchar increased so that the kind changed, not all
9701 characters are representable anymore and we need to fix the
9702 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009703 _PyUnicode_FastCopyCharacters(v, 0,
9704 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009705 maxchar_old = fixfct(v);
9706 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 }
9708 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009709 _PyUnicode_FastCopyCharacters(v, 0,
9710 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009712 Py_DECREF(u);
9713 assert(_PyUnicode_CheckConsistency(v, 1));
9714 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715}
9716
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717static PyObject *
9718ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9721 char *resdata, *data = PyUnicode_DATA(self);
9722 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009723
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 res = PyUnicode_New(len, 127);
9725 if (res == NULL)
9726 return NULL;
9727 resdata = PyUnicode_DATA(res);
9728 if (lower)
9729 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 _Py_bytes_upper(resdata, data, len);
9732 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733}
9734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738 Py_ssize_t j;
9739 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009740 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009742
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9744
9745 where ! is a negation and \p{xxx} is a character with property xxx.
9746 */
9747 for (j = i - 1; j >= 0; j--) {
9748 c = PyUnicode_READ(kind, data, j);
9749 if (!_PyUnicode_IsCaseIgnorable(c))
9750 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9753 if (final_sigma) {
9754 for (j = i + 1; j < length; j++) {
9755 c = PyUnicode_READ(kind, data, j);
9756 if (!_PyUnicode_IsCaseIgnorable(c))
9757 break;
9758 }
9759 final_sigma = j == length || !_PyUnicode_IsCased(c);
9760 }
9761 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762}
9763
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009764static int
9765lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9766 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768 /* Obscure special case. */
9769 if (c == 0x3A3) {
9770 mapped[0] = handle_capital_sigma(kind, data, length, i);
9771 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774}
9775
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776static Py_ssize_t
9777do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009779 Py_ssize_t i, k = 0;
9780 int n_res, j;
9781 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009782
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009783 c = PyUnicode_READ(kind, data, 0);
9784 n_res = _PyUnicode_ToUpperFull(c, mapped);
9785 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009786 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009787 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009789 for (i = 1; i < length; i++) {
9790 c = PyUnicode_READ(kind, data, i);
9791 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9792 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009793 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009794 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009795 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009796 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798}
9799
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009800static Py_ssize_t
9801do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9802 Py_ssize_t i, k = 0;
9803
9804 for (i = 0; i < length; i++) {
9805 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9806 int n_res, j;
9807 if (Py_UNICODE_ISUPPER(c)) {
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 }
9810 else if (Py_UNICODE_ISLOWER(c)) {
9811 n_res = _PyUnicode_ToUpperFull(c, mapped);
9812 }
9813 else {
9814 n_res = 1;
9815 mapped[0] = c;
9816 }
9817 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009818 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819 res[k++] = mapped[j];
9820 }
9821 }
9822 return k;
9823}
9824
9825static Py_ssize_t
9826do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9827 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009829 Py_ssize_t i, k = 0;
9830
9831 for (i = 0; i < length; i++) {
9832 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9833 int n_res, j;
9834 if (lower)
9835 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9836 else
9837 n_res = _PyUnicode_ToUpperFull(c, mapped);
9838 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009839 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009840 res[k++] = mapped[j];
9841 }
9842 }
9843 return k;
9844}
9845
9846static Py_ssize_t
9847do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9848{
9849 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9850}
9851
9852static Py_ssize_t
9853do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9854{
9855 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9856}
9857
Benjamin Petersone51757f2012-01-12 21:10:29 -05009858static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009859do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9860{
9861 Py_ssize_t i, k = 0;
9862
9863 for (i = 0; i < length; i++) {
9864 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9865 Py_UCS4 mapped[3];
9866 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9867 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009868 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009869 res[k++] = mapped[j];
9870 }
9871 }
9872 return k;
9873}
9874
9875static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009876do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9877{
9878 Py_ssize_t i, k = 0;
9879 int previous_is_cased;
9880
9881 previous_is_cased = 0;
9882 for (i = 0; i < length; i++) {
9883 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9884 Py_UCS4 mapped[3];
9885 int n_res, j;
9886
9887 if (previous_is_cased)
9888 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9889 else
9890 n_res = _PyUnicode_ToTitleFull(c, mapped);
9891
9892 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009893 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009894 res[k++] = mapped[j];
9895 }
9896
9897 previous_is_cased = _PyUnicode_IsCased(c);
9898 }
9899 return k;
9900}
9901
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009902static PyObject *
9903case_operation(PyObject *self,
9904 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9905{
9906 PyObject *res = NULL;
9907 Py_ssize_t length, newlength = 0;
9908 int kind, outkind;
9909 void *data, *outdata;
9910 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9911
Benjamin Petersoneea48462012-01-16 14:28:50 -05009912 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913
9914 kind = PyUnicode_KIND(self);
9915 data = PyUnicode_DATA(self);
9916 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009917 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009918 PyErr_SetString(PyExc_OverflowError, "string is too long");
9919 return NULL;
9920 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009921 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 if (tmp == NULL)
9923 return PyErr_NoMemory();
9924 newlength = perform(kind, data, length, tmp, &maxchar);
9925 res = PyUnicode_New(newlength, maxchar);
9926 if (res == NULL)
9927 goto leave;
9928 tmpend = tmp + newlength;
9929 outdata = PyUnicode_DATA(res);
9930 outkind = PyUnicode_KIND(res);
9931 switch (outkind) {
9932 case PyUnicode_1BYTE_KIND:
9933 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9934 break;
9935 case PyUnicode_2BYTE_KIND:
9936 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9937 break;
9938 case PyUnicode_4BYTE_KIND:
9939 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9940 break;
9941 default:
9942 assert(0);
9943 break;
9944 }
9945 leave:
9946 PyMem_FREE(tmp);
9947 return res;
9948}
9949
Tim Peters8ce9f162004-08-27 01:49:32 +00009950PyObject *
9951PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009953 PyObject *res;
9954 PyObject *fseq;
9955 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009958 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009959 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009960 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009961 }
9962
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009963 /* NOTE: the following code can't call back into Python code,
9964 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009965 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009967 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009968 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009969 res = _PyUnicode_JoinArray(separator, items, seqlen);
9970 Py_DECREF(fseq);
9971 return res;
9972}
9973
9974PyObject *
9975_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9976{
9977 PyObject *res = NULL; /* the result */
9978 PyObject *sep = NULL;
9979 Py_ssize_t seplen;
9980 PyObject *item;
9981 Py_ssize_t sz, i, res_offset;
9982 Py_UCS4 maxchar;
9983 Py_UCS4 item_maxchar;
9984 int use_memcpy;
9985 unsigned char *res_data = NULL, *sep_data = NULL;
9986 PyObject *last_obj;
9987 unsigned int kind = 0;
9988
Tim Peters05eba1f2004-08-27 21:32:02 +00009989 /* If empty sequence, return u"". */
9990 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009991 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009992 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009993
Tim Peters05eba1f2004-08-27 21:32:02 +00009994 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009995 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009996 if (seqlen == 1) {
9997 if (PyUnicode_CheckExact(items[0])) {
9998 res = items[0];
9999 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010000 return res;
10001 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010002 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010003 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010004 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010005 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010006 /* Set up sep and seplen */
10007 if (separator == NULL) {
10008 /* fall back to a blank space separator */
10009 sep = PyUnicode_FromOrdinal(' ');
10010 if (!sep)
10011 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010012 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010013 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010014 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010015 else {
10016 if (!PyUnicode_Check(separator)) {
10017 PyErr_Format(PyExc_TypeError,
10018 "separator: expected str instance,"
10019 " %.80s found",
10020 Py_TYPE(separator)->tp_name);
10021 goto onError;
10022 }
10023 if (PyUnicode_READY(separator))
10024 goto onError;
10025 sep = separator;
10026 seplen = PyUnicode_GET_LENGTH(separator);
10027 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10028 /* inc refcount to keep this code path symmetric with the
10029 above case of a blank separator */
10030 Py_INCREF(sep);
10031 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010032 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010033 }
10034
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010035 /* There are at least two things to join, or else we have a subclass
10036 * of str in the sequence.
10037 * Do a pre-pass to figure out the total amount of space we'll
10038 * need (sz), and see whether all argument are strings.
10039 */
10040 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010041#ifdef Py_DEBUG
10042 use_memcpy = 0;
10043#else
10044 use_memcpy = 1;
10045#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010046 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010047 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010048 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010049 if (!PyUnicode_Check(item)) {
10050 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +020010051 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +000010052 " %.80s found",
10053 i, Py_TYPE(item)->tp_name);
10054 goto onError;
10055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 if (PyUnicode_READY(item) == -1)
10057 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010058 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010060 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010061 if (i != 0) {
10062 add_sz += seplen;
10063 }
10064 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010065 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010067 goto onError;
10068 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010069 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 if (use_memcpy && last_obj != NULL) {
10071 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10072 use_memcpy = 0;
10073 }
10074 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010075 }
Tim Petersced69f82003-09-16 20:30:58 +000010076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010078 if (res == NULL)
10079 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010080
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010081 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010082#ifdef Py_DEBUG
10083 use_memcpy = 0;
10084#else
10085 if (use_memcpy) {
10086 res_data = PyUnicode_1BYTE_DATA(res);
10087 kind = PyUnicode_KIND(res);
10088 if (seplen != 0)
10089 sep_data = PyUnicode_1BYTE_DATA(sep);
10090 }
10091#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010092 if (use_memcpy) {
10093 for (i = 0; i < seqlen; ++i) {
10094 Py_ssize_t itemlen;
10095 item = items[i];
10096
10097 /* Copy item, and maybe the separator. */
10098 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010099 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010100 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010101 kind * seplen);
10102 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010103 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010104
10105 itemlen = PyUnicode_GET_LENGTH(item);
10106 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010107 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010108 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010109 kind * itemlen);
10110 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010111 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010112 }
10113 assert(res_data == PyUnicode_1BYTE_DATA(res)
10114 + kind * PyUnicode_GET_LENGTH(res));
10115 }
10116 else {
10117 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10118 Py_ssize_t itemlen;
10119 item = items[i];
10120
10121 /* Copy item, and maybe the separator. */
10122 if (i && seplen != 0) {
10123 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10124 res_offset += seplen;
10125 }
10126
10127 itemlen = PyUnicode_GET_LENGTH(item);
10128 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010129 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010130 res_offset += itemlen;
10131 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010132 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010133 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010134 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010137 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Benjamin Peterson29060642009-01-31 22:14:21 +000010140 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010142 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143 return NULL;
10144}
10145
/* Store `value` into `length` consecutive code units of `data`, beginning at
   code-unit index `start`.  `kind` selects the code-unit width (1, 2 or 4
   bytes); PyUnicode_WCHAR_KIND is not supported.

   Every macro argument is parenthesized where it is used, and `value` and
   `length` are captured once in locals so that an argument expression is
   not re-evaluated on each loop iteration. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10169
Victor Stinnerd3f08822012-05-29 12:57:52 +020010170void
10171_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10172 Py_UCS4 fill_char)
10173{
10174 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10175 const void *data = PyUnicode_DATA(unicode);
10176 assert(PyUnicode_IS_READY(unicode));
10177 assert(unicode_modifiable(unicode));
10178 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10179 assert(start >= 0);
10180 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10181 FILL(kind, data, fill_char, start, length);
10182}
10183
Victor Stinner3fe55312012-01-04 00:33:50 +010010184Py_ssize_t
10185PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10186 Py_UCS4 fill_char)
10187{
10188 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010189
10190 if (!PyUnicode_Check(unicode)) {
10191 PyErr_BadInternalCall();
10192 return -1;
10193 }
10194 if (PyUnicode_READY(unicode) == -1)
10195 return -1;
10196 if (unicode_check_modifiable(unicode))
10197 return -1;
10198
Victor Stinnerd3f08822012-05-29 12:57:52 +020010199 if (start < 0) {
10200 PyErr_SetString(PyExc_IndexError, "string index out of range");
10201 return -1;
10202 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010203 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10204 PyErr_SetString(PyExc_ValueError,
10205 "fill character is bigger than "
10206 "the string maximum character");
10207 return -1;
10208 }
10209
10210 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10211 length = Py_MIN(maxlen, length);
10212 if (length <= 0)
10213 return 0;
10214
Victor Stinnerd3f08822012-05-29 12:57:52 +020010215 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010216 return length;
10217}
10218
Victor Stinner9310abb2011-10-05 00:59:23 +020010219static PyObject *
10220pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010221 Py_ssize_t left,
10222 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 PyObject *u;
10226 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010227 int kind;
10228 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
10230 if (left < 0)
10231 left = 0;
10232 if (right < 0)
10233 right = 0;
10234
Victor Stinnerc4b49542011-12-11 22:44:26 +010010235 if (left == 0 && right == 0)
10236 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10239 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010240 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10241 return NULL;
10242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010244 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010246 if (!u)
10247 return NULL;
10248
10249 kind = PyUnicode_KIND(u);
10250 data = PyUnicode_DATA(u);
10251 if (left)
10252 FILL(kind, data, fill, 0, left);
10253 if (right)
10254 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010255 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010256 assert(_PyUnicode_CheckConsistency(u, 1));
10257 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258}
10259
Alexander Belopolsky40018472011-02-26 01:02:56 +000010260PyObject *
10261PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010265 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267
Benjamin Petersonead6b532011-12-20 17:23:42 -060010268 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 if (PyUnicode_IS_ASCII(string))
10271 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 PyUnicode_GET_LENGTH(string), keepends);
10274 else
10275 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010276 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010277 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 break;
10279 case PyUnicode_2BYTE_KIND:
10280 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010281 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 PyUnicode_GET_LENGTH(string), keepends);
10283 break;
10284 case PyUnicode_4BYTE_KIND:
10285 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 PyUnicode_GET_LENGTH(string), keepends);
10288 break;
10289 default:
10290 assert(0);
10291 list = 0;
10292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294}
10295
Alexander Belopolsky40018472011-02-26 01:02:56 +000010296static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010297split(PyObject *self,
10298 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010299 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010301 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 void *buf1, *buf2;
10303 Py_ssize_t len1, len2;
10304 PyObject* out;
10305
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010307 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (PyUnicode_READY(self) == -1)
10310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010313 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010315 if (PyUnicode_IS_ASCII(self))
10316 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010317 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010318 PyUnicode_GET_LENGTH(self), maxcount
10319 );
10320 else
10321 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010322 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 PyUnicode_GET_LENGTH(self), maxcount
10324 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 case PyUnicode_2BYTE_KIND:
10326 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 PyUnicode_GET_LENGTH(self), maxcount
10329 );
10330 case PyUnicode_4BYTE_KIND:
10331 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 PyUnicode_GET_LENGTH(self), maxcount
10334 );
10335 default:
10336 assert(0);
10337 return NULL;
10338 }
10339
10340 if (PyUnicode_READY(substring) == -1)
10341 return NULL;
10342
10343 kind1 = PyUnicode_KIND(self);
10344 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 len1 = PyUnicode_GET_LENGTH(self);
10346 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010347 if (kind1 < kind2 || len1 < len2) {
10348 out = PyList_New(1);
10349 if (out == NULL)
10350 return NULL;
10351 Py_INCREF(self);
10352 PyList_SET_ITEM(out, 0, self);
10353 return out;
10354 }
10355 buf1 = PyUnicode_DATA(self);
10356 buf2 = PyUnicode_DATA(substring);
10357 if (kind2 != kind1) {
10358 buf2 = _PyUnicode_AsKind(substring, kind1);
10359 if (!buf2)
10360 return NULL;
10361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010363 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010365 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10366 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010367 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010368 else
10369 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010370 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 break;
10372 case PyUnicode_2BYTE_KIND:
10373 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 break;
10376 case PyUnicode_4BYTE_KIND:
10377 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010378 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 break;
10380 default:
10381 out = NULL;
10382 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010383 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 PyMem_Free(buf2);
10385 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386}
10387
Alexander Belopolsky40018472011-02-26 01:02:56 +000010388static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010389rsplit(PyObject *self,
10390 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010391 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010392{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010393 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 void *buf1, *buf2;
10395 Py_ssize_t len1, len2;
10396 PyObject* out;
10397
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010398 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010399 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 if (PyUnicode_READY(self) == -1)
10402 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010405 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010407 if (PyUnicode_IS_ASCII(self))
10408 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010409 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010410 PyUnicode_GET_LENGTH(self), maxcount
10411 );
10412 else
10413 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010414 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010415 PyUnicode_GET_LENGTH(self), maxcount
10416 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 case PyUnicode_2BYTE_KIND:
10418 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010419 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 PyUnicode_GET_LENGTH(self), maxcount
10421 );
10422 case PyUnicode_4BYTE_KIND:
10423 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010424 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 PyUnicode_GET_LENGTH(self), maxcount
10426 );
10427 default:
10428 assert(0);
10429 return NULL;
10430 }
10431
10432 if (PyUnicode_READY(substring) == -1)
10433 return NULL;
10434
10435 kind1 = PyUnicode_KIND(self);
10436 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 len1 = PyUnicode_GET_LENGTH(self);
10438 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010439 if (kind1 < kind2 || len1 < len2) {
10440 out = PyList_New(1);
10441 if (out == NULL)
10442 return NULL;
10443 Py_INCREF(self);
10444 PyList_SET_ITEM(out, 0, self);
10445 return out;
10446 }
10447 buf1 = PyUnicode_DATA(self);
10448 buf2 = PyUnicode_DATA(substring);
10449 if (kind2 != kind1) {
10450 buf2 = _PyUnicode_AsKind(substring, kind1);
10451 if (!buf2)
10452 return NULL;
10453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010455 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010457 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10458 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010459 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010460 else
10461 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010462 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 break;
10464 case PyUnicode_2BYTE_KIND:
10465 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010466 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 break;
10468 case PyUnicode_4BYTE_KIND:
10469 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010470 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 break;
10472 default:
10473 out = NULL;
10474 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010475 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 PyMem_Free(buf2);
10477 return out;
10478}
10479
10480static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10482 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010484 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010486 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10487 return asciilib_find(buf1, len1, buf2, len2, offset);
10488 else
10489 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 case PyUnicode_2BYTE_KIND:
10491 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10492 case PyUnicode_4BYTE_KIND:
10493 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10494 }
10495 assert(0);
10496 return -1;
10497}
10498
10499static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010500anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10501 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010503 switch (kind) {
10504 case PyUnicode_1BYTE_KIND:
10505 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10506 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10507 else
10508 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10509 case PyUnicode_2BYTE_KIND:
10510 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10511 case PyUnicode_4BYTE_KIND:
10512 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10513 }
10514 assert(0);
10515 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010516}
10517
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518static void
10519replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10520 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10521{
10522 int kind = PyUnicode_KIND(u);
10523 void *data = PyUnicode_DATA(u);
10524 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10525 if (kind == PyUnicode_1BYTE_KIND) {
10526 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10527 (Py_UCS1 *)data + len,
10528 u1, u2, maxcount);
10529 }
10530 else if (kind == PyUnicode_2BYTE_KIND) {
10531 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10532 (Py_UCS2 *)data + len,
10533 u1, u2, maxcount);
10534 }
10535 else {
10536 assert(kind == PyUnicode_4BYTE_KIND);
10537 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10538 (Py_UCS4 *)data + len,
10539 u1, u2, maxcount);
10540 }
10541}
10542
Alexander Belopolsky40018472011-02-26 01:02:56 +000010543static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544replace(PyObject *self, PyObject *str1,
10545 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 PyObject *u;
10548 char *sbuf = PyUnicode_DATA(self);
10549 char *buf1 = PyUnicode_DATA(str1);
10550 char *buf2 = PyUnicode_DATA(str2);
10551 int srelease = 0, release1 = 0, release2 = 0;
10552 int skind = PyUnicode_KIND(self);
10553 int kind1 = PyUnicode_KIND(str1);
10554 int kind2 = PyUnicode_KIND(str2);
10555 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10556 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10557 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010558 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010559 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560
10561 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010562 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010564 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565
Victor Stinner59de0ee2011-10-07 10:01:28 +020010566 if (str1 == str2)
10567 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568
Victor Stinner49a0a212011-10-12 23:46:10 +020010569 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010570 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10571 if (maxchar < maxchar_str1)
10572 /* substring too wide to be present */
10573 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10575 /* Replacing str1 with str2 may cause a maxchar reduction in the
10576 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010577 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010578 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010581 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010583 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010585 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010587 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010588
Victor Stinner69ed0f42013-04-09 21:48:24 +020010589 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010590 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010591 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010593 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010597
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010598 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10599 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010600 }
10601 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 int rkind = skind;
10603 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010604 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (kind1 < rkind) {
10607 /* widen substring */
10608 buf1 = _PyUnicode_AsKind(str1, rkind);
10609 if (!buf1) goto error;
10610 release1 = 1;
10611 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010612 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 if (i < 0)
10614 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (rkind > kind2) {
10616 /* widen replacement */
10617 buf2 = _PyUnicode_AsKind(str2, rkind);
10618 if (!buf2) goto error;
10619 release2 = 1;
10620 }
10621 else if (rkind < kind2) {
10622 /* widen self and buf1 */
10623 rkind = kind2;
10624 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010625 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 sbuf = _PyUnicode_AsKind(self, rkind);
10627 if (!sbuf) goto error;
10628 srelease = 1;
10629 buf1 = _PyUnicode_AsKind(str1, rkind);
10630 if (!buf1) goto error;
10631 release1 = 1;
10632 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010633 u = PyUnicode_New(slen, maxchar);
10634 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 assert(PyUnicode_KIND(u) == rkind);
10637 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010638
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010640 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010641 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010645
10646 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010647 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010649 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010650 if (i == -1)
10651 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010652 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 }
10659 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010661 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 int rkind = skind;
10663 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010666 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf1 = _PyUnicode_AsKind(str1, rkind);
10668 if (!buf1) goto error;
10669 release1 = 1;
10670 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010671 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 if (n == 0)
10673 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010675 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 buf2 = _PyUnicode_AsKind(str2, rkind);
10677 if (!buf2) goto error;
10678 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010681 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 rkind = kind2;
10683 sbuf = _PyUnicode_AsKind(self, rkind);
10684 if (!sbuf) goto error;
10685 srelease = 1;
10686 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010687 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 buf1 = _PyUnicode_AsKind(str1, rkind);
10689 if (!buf1) goto error;
10690 release1 = 1;
10691 }
10692 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10693 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010694 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 PyErr_SetString(PyExc_OverflowError,
10696 "replace string is too long");
10697 goto error;
10698 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010699 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010700 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010701 _Py_INCREF_UNICODE_EMPTY();
10702 if (!unicode_empty)
10703 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010704 u = unicode_empty;
10705 goto done;
10706 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010707 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 PyErr_SetString(PyExc_OverflowError,
10709 "replace string is too long");
10710 goto error;
10711 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 u = PyUnicode_New(new_size, maxchar);
10713 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010715 assert(PyUnicode_KIND(u) == rkind);
10716 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 ires = i = 0;
10718 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719 while (n-- > 0) {
10720 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010721 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010722 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010723 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010724 if (j == -1)
10725 break;
10726 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010728 memcpy(res + rkind * ires,
10729 sbuf + rkind * i,
10730 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 }
10733 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010735 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010737 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010743 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010744 memcpy(res + rkind * ires,
10745 sbuf + rkind * i,
10746 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010747 }
10748 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010749 /* interleave */
10750 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010751 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010753 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010755 if (--n <= 0)
10756 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010757 memcpy(res + rkind * ires,
10758 sbuf + rkind * i,
10759 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 ires++;
10761 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010763 memcpy(res + rkind * ires,
10764 sbuf + rkind * i,
10765 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010766 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010767 }
10768
10769 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010770 unicode_adjust_maxchar(&u);
10771 if (u == NULL)
10772 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010774
10775 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (srelease)
10777 PyMem_FREE(sbuf);
10778 if (release1)
10779 PyMem_FREE(buf1);
10780 if (release2)
10781 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010782 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010784
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010786 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 if (srelease)
10788 PyMem_FREE(sbuf);
10789 if (release1)
10790 PyMem_FREE(buf1);
10791 if (release2)
10792 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010793 return unicode_result_unchanged(self);
10794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 error:
10796 if (srelease && sbuf)
10797 PyMem_FREE(sbuf);
10798 if (release1 && buf1)
10799 PyMem_FREE(buf1);
10800 if (release2 && buf2)
10801 PyMem_FREE(buf2);
10802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803}
10804
10805/* --- Unicode Object Methods --------------------------------------------- */
10806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010808 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809\n\
10810Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
10813static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010814unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010816 if (PyUnicode_READY(self) == -1)
10817 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010818 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819}
10820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010821PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010822 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823\n\
10824Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010825have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
10827static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010828unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010830 if (PyUnicode_READY(self) == -1)
10831 return NULL;
10832 if (PyUnicode_GET_LENGTH(self) == 0)
10833 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010834 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835}
10836
Benjamin Petersond5890c82012-01-14 13:23:30 -050010837PyDoc_STRVAR(casefold__doc__,
10838 "S.casefold() -> str\n\
10839\n\
10840Return a version of S suitable for caseless comparisons.");
10841
10842static PyObject *
10843unicode_casefold(PyObject *self)
10844{
10845 if (PyUnicode_READY(self) == -1)
10846 return NULL;
10847 if (PyUnicode_IS_ASCII(self))
10848 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010849 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010850}
10851
10852
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010853/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010854
10855static int
10856convert_uc(PyObject *obj, void *addr)
10857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010859
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010860 if (!PyUnicode_Check(obj)) {
10861 PyErr_Format(PyExc_TypeError,
10862 "The fill character must be a unicode character, "
10863 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010864 return 0;
10865 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010866 if (PyUnicode_READY(obj) < 0)
10867 return 0;
10868 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010869 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010871 return 0;
10872 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010873 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010874 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010875}
10876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010877PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010880Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010881done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
10883static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010884unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010886 Py_ssize_t marg, left;
10887 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 Py_UCS4 fillchar = ' ';
10889
Victor Stinnere9a29352011-10-01 02:14:59 +020010890 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892
Benjamin Petersonbac79492012-01-14 13:34:47 -050010893 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 return NULL;
10895
Victor Stinnerc4b49542011-12-11 22:44:26 +010010896 if (PyUnicode_GET_LENGTH(self) >= width)
10897 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Victor Stinnerc4b49542011-12-11 22:44:26 +010010899 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900 left = marg / 2 + (marg & width & 1);
10901
Victor Stinner9310abb2011-10-05 00:59:23 +020010902 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903}
10904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905/* This function assumes that str1 and str2 are readied by the caller. */
10906
Marc-André Lemburge5034372000-08-08 08:04:29 +000010907static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010908unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010909{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010910#define COMPARE(TYPE1, TYPE2) \
10911 do { \
10912 TYPE1* p1 = (TYPE1 *)data1; \
10913 TYPE2* p2 = (TYPE2 *)data2; \
10914 TYPE1* end = p1 + len; \
10915 Py_UCS4 c1, c2; \
10916 for (; p1 != end; p1++, p2++) { \
10917 c1 = *p1; \
10918 c2 = *p2; \
10919 if (c1 != c2) \
10920 return (c1 < c2) ? -1 : 1; \
10921 } \
10922 } \
10923 while (0)
10924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 int kind1, kind2;
10926 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010927 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 kind1 = PyUnicode_KIND(str1);
10930 kind2 = PyUnicode_KIND(str2);
10931 data1 = PyUnicode_DATA(str1);
10932 data2 = PyUnicode_DATA(str2);
10933 len1 = PyUnicode_GET_LENGTH(str1);
10934 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010935 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010936
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010937 switch(kind1) {
10938 case PyUnicode_1BYTE_KIND:
10939 {
10940 switch(kind2) {
10941 case PyUnicode_1BYTE_KIND:
10942 {
10943 int cmp = memcmp(data1, data2, len);
10944 /* normalize result of memcmp() into the range [-1; 1] */
10945 if (cmp < 0)
10946 return -1;
10947 if (cmp > 0)
10948 return 1;
10949 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010950 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951 case PyUnicode_2BYTE_KIND:
10952 COMPARE(Py_UCS1, Py_UCS2);
10953 break;
10954 case PyUnicode_4BYTE_KIND:
10955 COMPARE(Py_UCS1, Py_UCS4);
10956 break;
10957 default:
10958 assert(0);
10959 }
10960 break;
10961 }
10962 case PyUnicode_2BYTE_KIND:
10963 {
10964 switch(kind2) {
10965 case PyUnicode_1BYTE_KIND:
10966 COMPARE(Py_UCS2, Py_UCS1);
10967 break;
10968 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010969 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010970 COMPARE(Py_UCS2, Py_UCS2);
10971 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010972 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010973 case PyUnicode_4BYTE_KIND:
10974 COMPARE(Py_UCS2, Py_UCS4);
10975 break;
10976 default:
10977 assert(0);
10978 }
10979 break;
10980 }
10981 case PyUnicode_4BYTE_KIND:
10982 {
10983 switch(kind2) {
10984 case PyUnicode_1BYTE_KIND:
10985 COMPARE(Py_UCS4, Py_UCS1);
10986 break;
10987 case PyUnicode_2BYTE_KIND:
10988 COMPARE(Py_UCS4, Py_UCS2);
10989 break;
10990 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010991 {
10992#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10993 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10994 /* normalize result of wmemcmp() into the range [-1; 1] */
10995 if (cmp < 0)
10996 return -1;
10997 if (cmp > 0)
10998 return 1;
10999#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011000 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011001#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011002 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011003 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011004 default:
11005 assert(0);
11006 }
11007 break;
11008 }
11009 default:
11010 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011011 }
11012
Victor Stinner770e19e2012-10-04 22:59:45 +020011013 if (len1 == len2)
11014 return 0;
11015 if (len1 < len2)
11016 return -1;
11017 else
11018 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011019
11020#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011021}
11022
Benjamin Peterson621b4302016-09-09 13:54:34 -070011023static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011024unicode_compare_eq(PyObject *str1, PyObject *str2)
11025{
11026 int kind;
11027 void *data1, *data2;
11028 Py_ssize_t len;
11029 int cmp;
11030
Victor Stinnere5567ad2012-10-23 02:48:49 +020011031 len = PyUnicode_GET_LENGTH(str1);
11032 if (PyUnicode_GET_LENGTH(str2) != len)
11033 return 0;
11034 kind = PyUnicode_KIND(str1);
11035 if (PyUnicode_KIND(str2) != kind)
11036 return 0;
11037 data1 = PyUnicode_DATA(str1);
11038 data2 = PyUnicode_DATA(str2);
11039
11040 cmp = memcmp(data1, data2, len * kind);
11041 return (cmp == 0);
11042}
11043
11044
Alexander Belopolsky40018472011-02-26 01:02:56 +000011045int
11046PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11049 if (PyUnicode_READY(left) == -1 ||
11050 PyUnicode_READY(right) == -1)
11051 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011052
11053 /* a string is equal to itself */
11054 if (left == right)
11055 return 0;
11056
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011057 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011059 PyErr_Format(PyExc_TypeError,
11060 "Can't compare %.100s and %.100s",
11061 left->ob_type->tp_name,
11062 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 return -1;
11064}
11065
Martin v. Löwis5b222132007-06-10 09:51:05 +000011066int
11067PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 Py_ssize_t i;
11070 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011072 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073
Victor Stinner910337b2011-10-03 03:20:16 +020011074 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011075 if (!PyUnicode_IS_READY(uni)) {
11076 const wchar_t *ws = _PyUnicode_WSTR(uni);
11077 /* Compare Unicode string and source character set string */
11078 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11079 if (chr != ustr[i])
11080 return (chr < ustr[i]) ? -1 : 1;
11081 }
11082 /* This check keeps Python strings that end in '\0' from comparing equal
11083 to C strings identical up to that point. */
11084 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11085 return 1; /* uni is longer */
11086 if (ustr[i])
11087 return -1; /* str is longer */
11088 return 0;
11089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011091 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011092 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011093 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011094 size_t len, len2 = strlen(str);
11095 int cmp;
11096
11097 len = Py_MIN(len1, len2);
11098 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011099 if (cmp != 0) {
11100 if (cmp < 0)
11101 return -1;
11102 else
11103 return 1;
11104 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011105 if (len1 > len2)
11106 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011107 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011108 return -1; /* str is longer */
11109 return 0;
11110 }
11111 else {
11112 void *data = PyUnicode_DATA(uni);
11113 /* Compare Unicode string and source character set string */
11114 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011115 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011116 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11117 /* This check keeps Python strings that end in '\0' from comparing equal
11118 to C strings identical up to that point. */
11119 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11120 return 1; /* uni is longer */
11121 if (str[i])
11122 return -1; /* str is longer */
11123 return 0;
11124 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011125}
11126
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011127static int
11128non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11129{
11130 size_t i, len;
11131 const wchar_t *p;
11132 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11133 if (strlen(str) != len)
11134 return 0;
11135 p = _PyUnicode_WSTR(unicode);
11136 assert(p);
11137 for (i = 0; i < len; i++) {
11138 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011139 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011140 return 0;
11141 }
11142 return 1;
11143}
11144
11145int
11146_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11147{
11148 size_t len;
11149 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011150 assert(str);
11151#ifndef NDEBUG
11152 for (const char *p = str; *p; p++) {
11153 assert((unsigned char)*p < 128);
11154 }
11155#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011156 if (PyUnicode_READY(unicode) == -1) {
11157 /* Memory error or bad data */
11158 PyErr_Clear();
11159 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11160 }
11161 if (!PyUnicode_IS_ASCII(unicode))
11162 return 0;
11163 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11164 return strlen(str) == len &&
11165 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11166}
11167
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011168int
11169_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11170{
11171 PyObject *right_uni;
11172 Py_hash_t hash;
11173
11174 assert(_PyUnicode_CHECK(left));
11175 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011176#ifndef NDEBUG
11177 for (const char *p = right->string; *p; p++) {
11178 assert((unsigned char)*p < 128);
11179 }
11180#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011181
11182 if (PyUnicode_READY(left) == -1) {
11183 /* memory error or bad data */
11184 PyErr_Clear();
11185 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11186 }
11187
11188 if (!PyUnicode_IS_ASCII(left))
11189 return 0;
11190
11191 right_uni = _PyUnicode_FromId(right); /* borrowed */
11192 if (right_uni == NULL) {
11193 /* memory error or bad data */
11194 PyErr_Clear();
11195 return _PyUnicode_EqualToASCIIString(left, right->string);
11196 }
11197
11198 if (left == right_uni)
11199 return 1;
11200
11201 if (PyUnicode_CHECK_INTERNED(left))
11202 return 0;
11203
11204 assert(_PyUnicode_HASH(right_uni) != 1);
11205 hash = _PyUnicode_HASH(left);
11206 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11207 return 0;
11208
11209 return unicode_compare_eq(left, right_uni);
11210}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011211
Benjamin Peterson29060642009-01-31 22:14:21 +000011212#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011213 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011214
Alexander Belopolsky40018472011-02-26 01:02:56 +000011215PyObject *
11216PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011217{
11218 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011219 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011220
Victor Stinnere5567ad2012-10-23 02:48:49 +020011221 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11222 Py_RETURN_NOTIMPLEMENTED;
11223
11224 if (PyUnicode_READY(left) == -1 ||
11225 PyUnicode_READY(right) == -1)
11226 return NULL;
11227
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011228 if (left == right) {
11229 switch (op) {
11230 case Py_EQ:
11231 case Py_LE:
11232 case Py_GE:
11233 /* a string is equal to itself */
11234 v = Py_True;
11235 break;
11236 case Py_NE:
11237 case Py_LT:
11238 case Py_GT:
11239 v = Py_False;
11240 break;
11241 default:
11242 PyErr_BadArgument();
11243 return NULL;
11244 }
11245 }
11246 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011247 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011248 result ^= (op == Py_NE);
11249 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011250 }
11251 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011252 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011253
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011254 /* Convert the return value to a Boolean */
11255 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011256 case Py_LE:
11257 v = TEST_COND(result <= 0);
11258 break;
11259 case Py_GE:
11260 v = TEST_COND(result >= 0);
11261 break;
11262 case Py_LT:
11263 v = TEST_COND(result == -1);
11264 break;
11265 case Py_GT:
11266 v = TEST_COND(result == 1);
11267 break;
11268 default:
11269 PyErr_BadArgument();
11270 return NULL;
11271 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011272 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011273 Py_INCREF(v);
11274 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011275}
11276
Alexander Belopolsky40018472011-02-26 01:02:56 +000011277int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011278_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11279{
11280 return unicode_eq(aa, bb);
11281}
11282
11283int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011284PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011285{
Victor Stinner77282cb2013-04-14 19:22:47 +020011286 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 void *buf1, *buf2;
11288 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011289 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011290
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011293 "'in <string>' requires string as left operand, not %.100s",
11294 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011295 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011296 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011298 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 if (ensure_unicode(str) < 0)
11300 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 kind2 = PyUnicode_KIND(substr);
11304 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011305 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011307 len2 = PyUnicode_GET_LENGTH(substr);
11308 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011309 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011310 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011311 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011312 if (len2 == 1) {
11313 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11314 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011315 return result;
11316 }
11317 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 buf2 = _PyUnicode_AsKind(substr, kind1);
11319 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011320 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322
Victor Stinner77282cb2013-04-14 19:22:47 +020011323 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 case PyUnicode_1BYTE_KIND:
11325 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11326 break;
11327 case PyUnicode_2BYTE_KIND:
11328 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11329 break;
11330 case PyUnicode_4BYTE_KIND:
11331 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11332 break;
11333 default:
11334 result = -1;
11335 assert(0);
11336 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011337
Victor Stinner77282cb2013-04-14 19:22:47 +020011338 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 PyMem_Free(buf2);
11340
Guido van Rossum403d68b2000-03-13 15:55:09 +000011341 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011342}
11343
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344/* Concat to string or Unicode object giving a new Unicode object. */
11345
Alexander Belopolsky40018472011-02-26 01:02:56 +000011346PyObject *
11347PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011349 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011350 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011351 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011353 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
11356 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011357 if (left == unicode_empty)
11358 return PyUnicode_FromObject(right);
11359 if (right == unicode_empty)
11360 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011362 left_len = PyUnicode_GET_LENGTH(left);
11363 right_len = PyUnicode_GET_LENGTH(right);
11364 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011365 PyErr_SetString(PyExc_OverflowError,
11366 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011367 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011368 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011370
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011371 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11372 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011373 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011376 result = PyUnicode_New(new_len, maxchar);
11377 if (result == NULL)
11378 return NULL;
11379 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11380 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11381 assert(_PyUnicode_CheckConsistency(result, 1));
11382 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383}
11384
Walter Dörwald1ab83302007-05-18 17:15:44 +000011385void
Victor Stinner23e56682011-10-03 03:54:37 +020011386PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011387{
Victor Stinner23e56682011-10-03 03:54:37 +020011388 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011389 Py_UCS4 maxchar, maxchar2;
11390 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011391
11392 if (p_left == NULL) {
11393 if (!PyErr_Occurred())
11394 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011395 return;
11396 }
Victor Stinner23e56682011-10-03 03:54:37 +020011397 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011398 if (right == NULL || left == NULL
11399 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011400 if (!PyErr_Occurred())
11401 PyErr_BadInternalCall();
11402 goto error;
11403 }
11404
Benjamin Petersonbac79492012-01-14 13:34:47 -050011405 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011406 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011407 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011408 goto error;
11409
Victor Stinner488fa492011-12-12 00:01:39 +010011410 /* Shortcuts */
11411 if (left == unicode_empty) {
11412 Py_DECREF(left);
11413 Py_INCREF(right);
11414 *p_left = right;
11415 return;
11416 }
11417 if (right == unicode_empty)
11418 return;
11419
11420 left_len = PyUnicode_GET_LENGTH(left);
11421 right_len = PyUnicode_GET_LENGTH(right);
11422 if (left_len > PY_SSIZE_T_MAX - right_len) {
11423 PyErr_SetString(PyExc_OverflowError,
11424 "strings are too large to concat");
11425 goto error;
11426 }
11427 new_len = left_len + right_len;
11428
11429 if (unicode_modifiable(left)
11430 && PyUnicode_CheckExact(right)
11431 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011432 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11433 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011434 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011435 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011436 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11437 {
11438 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011439 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011440 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011441
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011442 /* copy 'right' into the newly allocated area of 'left' */
11443 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011444 }
Victor Stinner488fa492011-12-12 00:01:39 +010011445 else {
11446 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11447 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011448 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011449
Victor Stinner488fa492011-12-12 00:01:39 +010011450 /* Concat the two Unicode strings */
11451 res = PyUnicode_New(new_len, maxchar);
11452 if (res == NULL)
11453 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011454 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11455 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011456 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011457 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011458 }
11459 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011460 return;
11461
11462error:
Victor Stinner488fa492011-12-12 00:01:39 +010011463 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011464}
11465
11466void
11467PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11468{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011469 PyUnicode_Append(pleft, right);
11470 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011471}
11472
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011473/*
11474Wraps stringlib_parse_args_finds() and additionally ensures that the
11475first argument is a unicode object.
11476*/
11477
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011478static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011479parse_args_finds_unicode(const char * function_name, PyObject *args,
11480 PyObject **substring,
11481 Py_ssize_t *start, Py_ssize_t *end)
11482{
11483 if(stringlib_parse_args_finds(function_name, args, substring,
11484 start, end)) {
11485 if (ensure_unicode(*substring) < 0)
11486 return 0;
11487 return 1;
11488 }
11489 return 0;
11490}
11491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011492PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011495Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011496string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
11499static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011500unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011502 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011503 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011504 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011506 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 void *buf1, *buf2;
11508 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011510 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 kind1 = PyUnicode_KIND(self);
11514 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011515 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011516 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 len1 = PyUnicode_GET_LENGTH(self);
11519 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011521 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011522 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011524 buf1 = PyUnicode_DATA(self);
11525 buf2 = PyUnicode_DATA(substring);
11526 if (kind2 != kind1) {
11527 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011528 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011529 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011530 }
11531 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 case PyUnicode_1BYTE_KIND:
11533 iresult = ucs1lib_count(
11534 ((Py_UCS1*)buf1) + start, end - start,
11535 buf2, len2, PY_SSIZE_T_MAX
11536 );
11537 break;
11538 case PyUnicode_2BYTE_KIND:
11539 iresult = ucs2lib_count(
11540 ((Py_UCS2*)buf1) + start, end - start,
11541 buf2, len2, PY_SSIZE_T_MAX
11542 );
11543 break;
11544 case PyUnicode_4BYTE_KIND:
11545 iresult = ucs4lib_count(
11546 ((Py_UCS4*)buf1) + start, end - start,
11547 buf2, len2, PY_SSIZE_T_MAX
11548 );
11549 break;
11550 default:
11551 assert(0); iresult = 0;
11552 }
11553
11554 result = PyLong_FromSsize_t(iresult);
11555
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011556 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 return result;
11560}
11561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011562PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011563 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011565Encode S using the codec registered for encoding. Default encoding\n\
11566is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011567handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011568a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11569'xmlcharrefreplace' as well as any other name registered with\n\
11570codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571
11572static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011573unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011575 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 char *encoding = NULL;
11577 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011578
Benjamin Peterson308d6372009-09-18 21:42:35 +000011579 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11580 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011582 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011583}
11584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011586 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587\n\
11588Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011589If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
11591static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011592unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 Py_ssize_t i, j, line_pos, src_len, incr;
11595 Py_UCS4 ch;
11596 PyObject *u;
11597 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011598 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011600 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011601 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
Ezio Melotti745d54d2013-11-16 19:10:57 +020011603 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11604 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
Antoine Pitrou22425222011-10-04 19:10:51 +020011607 if (PyUnicode_READY(self) == -1)
11608 return NULL;
11609
Thomas Wouters7e474022000-07-16 12:04:32 +000011610 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011611 src_len = PyUnicode_GET_LENGTH(self);
11612 i = j = line_pos = 0;
11613 kind = PyUnicode_KIND(self);
11614 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011615 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011616 for (; i < src_len; i++) {
11617 ch = PyUnicode_READ(kind, src_data, i);
11618 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011619 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011621 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011623 goto overflow;
11624 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011626 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011630 goto overflow;
11631 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011633 if (ch == '\n' || ch == '\r')
11634 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011636 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011637 if (!found)
11638 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011639
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011641 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 if (!u)
11643 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011644 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645
Antoine Pitroue71d5742011-10-04 15:55:09 +020011646 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
Antoine Pitroue71d5742011-10-04 15:55:09 +020011648 for (; i < src_len; i++) {
11649 ch = PyUnicode_READ(kind, src_data, i);
11650 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011652 incr = tabsize - (line_pos % tabsize);
11653 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011654 FILL(kind, dest_data, ' ', j, incr);
11655 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011659 line_pos++;
11660 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011661 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011662 if (ch == '\n' || ch == '\r')
11663 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011665 }
11666 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011667 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011668
Antoine Pitroue71d5742011-10-04 15:55:09 +020011669 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011670 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672}
11673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011674PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676\n\
11677Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011678such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679arguments start and end are interpreted as in slice notation.\n\
11680\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011681Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
11683static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011686 /* initialize variables to prevent gcc warning */
11687 PyObject *substring = NULL;
11688 Py_ssize_t start = 0;
11689 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011690 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011692 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011695 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011698 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 if (result == -2)
11701 return NULL;
11702
Christian Heimes217cfd12007-12-02 14:31:20 +000011703 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704}
11705
11706static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011707unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011709 void *data;
11710 enum PyUnicode_Kind kind;
11711 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011712
Serhiy Storchakaddb536b2017-09-08 10:43:54 +030011713 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011714 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011716 }
Serhiy Storchakaddb536b2017-09-08 10:43:54 +030011717 if (PyUnicode_READY(self) == -1) {
11718 return NULL;
11719 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011720 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11721 PyErr_SetString(PyExc_IndexError, "string index out of range");
11722 return NULL;
11723 }
11724 kind = PyUnicode_KIND(self);
11725 data = PyUnicode_DATA(self);
11726 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011727 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
Guido van Rossumc2504932007-09-18 19:42:40 +000011730/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011731 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011732static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011733unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
Guido van Rossumc2504932007-09-18 19:42:40 +000011735 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011736 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011737
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011738#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011739 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011740#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (_PyUnicode_HASH(self) != -1)
11742 return _PyUnicode_HASH(self);
11743 if (PyUnicode_READY(self) == -1)
11744 return -1;
11745 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011746 /*
11747 We make the hash of the empty string be 0, rather than using
11748 (prefix ^ suffix), since this slightly obfuscates the hash secret
11749 */
11750 if (len == 0) {
11751 _PyUnicode_HASH(self) = 0;
11752 return 0;
11753 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011754 x = _Py_HashBytes(PyUnicode_DATA(self),
11755 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011757 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762\n\
Mariatta577fc042017-04-09 15:17:06 -070011763Return the lowest index in S where substring sub is found, \n\
11764such that sub is contained within S[start:end]. Optional\n\
11765arguments start and end are interpreted as in slice notation.\n\
11766\n\
11767Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
11769static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011772 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011773 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011774 PyObject *substring = NULL;
11775 Py_ssize_t start = 0;
11776 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011778 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011781 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011784 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (result == -2)
11787 return NULL;
11788
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 if (result < 0) {
11790 PyErr_SetString(PyExc_ValueError, "substring not found");
11791 return NULL;
11792 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011793
Christian Heimes217cfd12007-12-02 14:31:20 +000011794 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795}
11796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011800Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
11803static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 Py_ssize_t i, length;
11807 int kind;
11808 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 int cased;
11810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 if (PyUnicode_READY(self) == -1)
11812 return NULL;
11813 length = PyUnicode_GET_LENGTH(self);
11814 kind = PyUnicode_KIND(self);
11815 data = PyUnicode_DATA(self);
11816
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (length == 1)
11819 return PyBool_FromLong(
11820 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011822 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011825
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 for (i = 0; i < length; i++) {
11828 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011829
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11831 return PyBool_FromLong(0);
11832 else if (!cased && Py_UNICODE_ISLOWER(ch))
11833 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011835 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836}
11837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011838PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011841Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011842at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
11844static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011845unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 Py_ssize_t i, length;
11848 int kind;
11849 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 int cased;
11851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (PyUnicode_READY(self) == -1)
11853 return NULL;
11854 length = PyUnicode_GET_LENGTH(self);
11855 kind = PyUnicode_KIND(self);
11856 data = PyUnicode_DATA(self);
11857
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 1)
11860 return PyBool_FromLong(
11861 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011863 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011866
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 for (i = 0; i < length; i++) {
11869 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011870
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11872 return PyBool_FromLong(0);
11873 else if (!cased && Py_UNICODE_ISUPPER(ch))
11874 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011876 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877}
11878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011879PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011882Return True if S is a titlecased string and there is at least one\n\
11883character in S, i.e. upper- and titlecase characters may only\n\
11884follow uncased characters and lowercase characters only cased ones.\n\
11885Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011888unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 Py_ssize_t i, length;
11891 int kind;
11892 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 int cased, previous_is_cased;
11894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (PyUnicode_READY(self) == -1)
11896 return NULL;
11897 length = PyUnicode_GET_LENGTH(self);
11898 kind = PyUnicode_KIND(self);
11899 data = PyUnicode_DATA(self);
11900
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (length == 1) {
11903 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11904 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11905 (Py_UNICODE_ISUPPER(ch) != 0));
11906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011908 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011911
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 cased = 0;
11913 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 for (i = 0; i < length; i++) {
11915 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011916
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11918 if (previous_is_cased)
11919 return PyBool_FromLong(0);
11920 previous_is_cased = 1;
11921 cased = 1;
11922 }
11923 else if (Py_UNICODE_ISLOWER(ch)) {
11924 if (!previous_is_cased)
11925 return PyBool_FromLong(0);
11926 previous_is_cased = 1;
11927 cased = 1;
11928 }
11929 else
11930 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011932 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933}
11934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011935PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011938Return True if all characters in S are whitespace\n\
11939and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
11941static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011942unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 Py_ssize_t i, length;
11945 int kind;
11946 void *data;
11947
11948 if (PyUnicode_READY(self) == -1)
11949 return NULL;
11950 length = PyUnicode_GET_LENGTH(self);
11951 kind = PyUnicode_KIND(self);
11952 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 if (length == 1)
11956 return PyBool_FromLong(
11957 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011959 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 for (i = 0; i < length; i++) {
11964 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011965 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011968 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969}
11970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011971PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011974Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011975and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976
11977static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011978unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 Py_ssize_t i, length;
11981 int kind;
11982 void *data;
11983
11984 if (PyUnicode_READY(self) == -1)
11985 return NULL;
11986 length = PyUnicode_GET_LENGTH(self);
11987 kind = PyUnicode_KIND(self);
11988 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011989
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (length == 1)
11992 return PyBool_FromLong(
11993 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011994
11995 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 for (i = 0; i < length; i++) {
12000 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012003 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004}
12005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012006PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012008\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000012009Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012010and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011
12012static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012013unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 int kind;
12016 void *data;
12017 Py_ssize_t len, i;
12018
12019 if (PyUnicode_READY(self) == -1)
12020 return NULL;
12021
12022 kind = PyUnicode_KIND(self);
12023 data = PyUnicode_DATA(self);
12024 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012025
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012026 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (len == 1) {
12028 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12029 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12030 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012031
12032 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 for (i = 0; i < len; i++) {
12037 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012038 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012040 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012041 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012042}
12043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012044PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000012047Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012048False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
12050static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012051unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 Py_ssize_t i, length;
12054 int kind;
12055 void *data;
12056
12057 if (PyUnicode_READY(self) == -1)
12058 return NULL;
12059 length = PyUnicode_GET_LENGTH(self);
12060 kind = PyUnicode_KIND(self);
12061 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (length == 1)
12065 return PyBool_FromLong(
12066 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012068 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 for (i = 0; i < length; i++) {
12073 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012076 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077}
12078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012079PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000012082Return True if all characters in S are digits\n\
12083and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
12085static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012086unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 Py_ssize_t i, length;
12089 int kind;
12090 void *data;
12091
12092 if (PyUnicode_READY(self) == -1)
12093 return NULL;
12094 length = PyUnicode_GET_LENGTH(self);
12095 kind = PyUnicode_KIND(self);
12096 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (length == 1) {
12100 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12101 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012104 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 for (i = 0; i < length; i++) {
12109 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012112 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012115PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000012118Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
12121static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012122unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 Py_ssize_t i, length;
12125 int kind;
12126 void *data;
12127
12128 if (PyUnicode_READY(self) == -1)
12129 return NULL;
12130 length = PyUnicode_GET_LENGTH(self);
12131 kind = PyUnicode_KIND(self);
12132 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 if (length == 1)
12136 return PyBool_FromLong(
12137 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012139 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 for (i = 0; i < length; i++) {
12144 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012147 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148}
12149
Martin v. Löwis47383402007-08-15 07:32:56 +000012150int
12151PyUnicode_IsIdentifier(PyObject *self)
12152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 int kind;
12154 void *data;
12155 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012156 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (PyUnicode_READY(self) == -1) {
12159 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 }
12162
12163 /* Special case for empty strings */
12164 if (PyUnicode_GET_LENGTH(self) == 0)
12165 return 0;
12166 kind = PyUnicode_KIND(self);
12167 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012168
12169 /* PEP 3131 says that the first character must be in
12170 XID_Start and subsequent characters in XID_Continue,
12171 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012172 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012173 letters, digits, underscore). However, given the current
12174 definition of XID_Start and XID_Continue, it is sufficient
12175 to check just for these, except that _ must be allowed
12176 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012178 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012179 return 0;
12180
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012181 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012183 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012184 return 1;
12185}
12186
12187PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012189\n\
12190Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012191to the language definition.\n\
12192\n\
12193Use keyword.iskeyword() to test for reserved identifiers\n\
12194such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012195
12196static PyObject*
12197unicode_isidentifier(PyObject *self)
12198{
12199 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12200}
12201
Georg Brandl559e5d72008-06-11 18:37:52 +000012202PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012204\n\
12205Return True if all characters in S are considered\n\
12206printable in repr() or S is empty, False otherwise.");
12207
12208static PyObject*
12209unicode_isprintable(PyObject *self)
12210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_ssize_t i, length;
12212 int kind;
12213 void *data;
12214
12215 if (PyUnicode_READY(self) == -1)
12216 return NULL;
12217 length = PyUnicode_GET_LENGTH(self);
12218 kind = PyUnicode_KIND(self);
12219 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012220
12221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 if (length == 1)
12223 return PyBool_FromLong(
12224 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 for (i = 0; i < length; i++) {
12227 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012228 Py_RETURN_FALSE;
12229 }
12230 }
12231 Py_RETURN_TRUE;
12232}
12233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012234PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012235 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236\n\
12237Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012238iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239
12240static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012241unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012243 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244}
12245
Martin v. Löwis18e16552006-02-15 17:27:45 +000012246static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012247unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 if (PyUnicode_READY(self) == -1)
12250 return -1;
12251 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252}
12253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012254PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012257Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012258done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012261unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012263 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 Py_UCS4 fillchar = ' ';
12265
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012266 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 return NULL;
12268
Benjamin Petersonbac79492012-01-14 13:34:47 -050012269 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
Victor Stinnerc4b49542011-12-11 22:44:26 +010012272 if (PyUnicode_GET_LENGTH(self) >= width)
12273 return unicode_result_unchanged(self);
12274
12275 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276}
12277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012278PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012281Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
12283static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012284unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012286 if (PyUnicode_READY(self) == -1)
12287 return NULL;
12288 if (PyUnicode_IS_ASCII(self))
12289 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012290 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291}
12292
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for strip mode i: skips the three-character "|O:"
   prefix of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i] + 3)
12301
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302/* externally visible for str.strip(unicode) */
12303PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012304_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 void *data;
12307 int kind;
12308 Py_ssize_t i, j, len;
12309 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012310 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12313 return NULL;
12314
12315 kind = PyUnicode_KIND(self);
12316 data = PyUnicode_DATA(self);
12317 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012318 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12320 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012321 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012322
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 i = 0;
12324 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012325 while (i < len) {
12326 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12327 if (!BLOOM(sepmask, ch))
12328 break;
12329 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12330 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 i++;
12332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 j = len;
12336 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012337 j--;
12338 while (j >= i) {
12339 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12340 if (!BLOOM(sepmask, ch))
12341 break;
12342 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12343 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012345 }
12346
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012349
Victor Stinner7931d9a2011-11-04 00:22:48 +010012350 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351}
12352
12353PyObject*
12354PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12355{
12356 unsigned char *data;
12357 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012358 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359
Victor Stinnerde636f32011-10-01 03:55:54 +020012360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
12362
Victor Stinner684d5fd2012-05-03 02:32:34 +020012363 length = PyUnicode_GET_LENGTH(self);
12364 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012365
Victor Stinner684d5fd2012-05-03 02:32:34 +020012366 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012367 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368
Victor Stinnerde636f32011-10-01 03:55:54 +020012369 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012370 PyErr_SetString(PyExc_IndexError, "string index out of range");
12371 return NULL;
12372 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012373 if (start >= length || end < start)
12374 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012375
Victor Stinner684d5fd2012-05-03 02:32:34 +020012376 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012377 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012378 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012379 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012380 }
12381 else {
12382 kind = PyUnicode_KIND(self);
12383 data = PyUnicode_1BYTE_DATA(self);
12384 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012385 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012386 length);
12387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389
12390static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012391do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 Py_ssize_t len, i, j;
12394
12395 if (PyUnicode_READY(self) == -1)
12396 return NULL;
12397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012399
Victor Stinnercc7af722013-04-09 22:39:24 +020012400 if (PyUnicode_IS_ASCII(self)) {
12401 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12402
12403 i = 0;
12404 if (striptype != RIGHTSTRIP) {
12405 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012406 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012407 if (!_Py_ascii_whitespace[ch])
12408 break;
12409 i++;
12410 }
12411 }
12412
12413 j = len;
12414 if (striptype != LEFTSTRIP) {
12415 j--;
12416 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012417 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012418 if (!_Py_ascii_whitespace[ch])
12419 break;
12420 j--;
12421 }
12422 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 }
12424 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012425 else {
12426 int kind = PyUnicode_KIND(self);
12427 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012428
Victor Stinnercc7af722013-04-09 22:39:24 +020012429 i = 0;
12430 if (striptype != RIGHTSTRIP) {
12431 while (i < len) {
12432 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12433 if (!Py_UNICODE_ISSPACE(ch))
12434 break;
12435 i++;
12436 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012437 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012438
12439 j = len;
12440 if (striptype != LEFTSTRIP) {
12441 j--;
12442 while (j >= i) {
12443 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12444 if (!Py_UNICODE_ISSPACE(ch))
12445 break;
12446 j--;
12447 }
12448 j++;
12449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012451
Victor Stinner7931d9a2011-11-04 00:22:48 +010012452 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453}
12454
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012455
12456static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012457do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
Serhiy Storchakac6792272013-10-19 21:03:34 +030012461 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 if (sep != NULL && sep != Py_None) {
12465 if (PyUnicode_Check(sep))
12466 return _PyUnicode_XStrip(self, striptype, sep);
12467 else {
12468 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 "%s arg must be None or str",
12470 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 return NULL;
12472 }
12473 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012474
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476}
12477
12478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012479PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481\n\
12482Return a copy of the string S with leading and trailing\n\
12483whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012484If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012485
12486static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012487unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 if (PyTuple_GET_SIZE(args) == 0)
12490 return do_strip(self, BOTHSTRIP); /* Common case */
12491 else
12492 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493}
12494
12495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012496PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498\n\
12499Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012500If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012501
12502static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012503unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012504{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012505 if (PyTuple_GET_SIZE(args) == 0)
12506 return do_strip(self, LEFTSTRIP); /* Common case */
12507 else
12508 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012509}
12510
12511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514\n\
12515Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012516If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012517
12518static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012519unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012520{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012521 if (PyTuple_GET_SIZE(args) == 0)
12522 return do_strip(self, RIGHTSTRIP); /* Common case */
12523 else
12524 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012525}
12526
12527
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012529unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012531 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
Serhiy Storchaka05997252013-01-26 12:14:02 +020012534 if (len < 1)
12535 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
Victor Stinnerc4b49542011-12-11 22:44:26 +010012537 /* no repeat, return original string */
12538 if (len == 1)
12539 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012540
Benjamin Petersonbac79492012-01-14 13:34:47 -050012541 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 return NULL;
12543
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012544 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012545 PyErr_SetString(PyExc_OverflowError,
12546 "repeated string is too long");
12547 return NULL;
12548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012550
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012551 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 if (!u)
12553 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012554 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 if (PyUnicode_GET_LENGTH(str) == 1) {
12557 const int kind = PyUnicode_KIND(str);
12558 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012559 if (kind == PyUnicode_1BYTE_KIND) {
12560 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012561 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012562 }
12563 else if (kind == PyUnicode_2BYTE_KIND) {
12564 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012565 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012566 ucs2[n] = fill_char;
12567 } else {
12568 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12569 assert(kind == PyUnicode_4BYTE_KIND);
12570 for (n = 0; n < len; ++n)
12571 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 }
12574 else {
12575 /* number of characters copied this far */
12576 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012577 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012579 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012583 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012584 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586 }
12587
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012588 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012589 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590}
12591
Alexander Belopolsky40018472011-02-26 01:02:56 +000012592PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012593PyUnicode_Replace(PyObject *str,
12594 PyObject *substr,
12595 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012596 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012598 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12599 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012601 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602}
12603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012604PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012605 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606\n\
12607Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012608old replaced by new. If the optional argument count is\n\
12609given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610
12611static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 PyObject *str1;
12615 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012616 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012618 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012620 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012622 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623}
12624
Alexander Belopolsky40018472011-02-26 01:02:56 +000012625static PyObject *
12626unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012628 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 Py_ssize_t isize;
12630 Py_ssize_t osize, squote, dquote, i, o;
12631 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012632 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012636 return NULL;
12637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 isize = PyUnicode_GET_LENGTH(unicode);
12639 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 /* Compute length of output, quote characters, and
12642 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012643 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 max = 127;
12645 squote = dquote = 0;
12646 ikind = PyUnicode_KIND(unicode);
12647 for (i = 0; i < isize; i++) {
12648 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 case '\'': squote++; break;
12652 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012654 incr = 2;
12655 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 default:
12657 /* Fast-path ASCII */
12658 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012659 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 ;
12662 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012665 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012669 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 if (osize > PY_SSIZE_T_MAX - incr) {
12672 PyErr_SetString(PyExc_OverflowError,
12673 "string is too long to generate repr");
12674 return NULL;
12675 }
12676 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 }
12678
12679 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012680 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012682 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 if (dquote)
12684 /* Both squote and dquote present. Use squote,
12685 and escape them */
12686 osize += squote;
12687 else
12688 quote = '"';
12689 }
Victor Stinner55c08782013-04-14 18:45:39 +020012690 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691
12692 repr = PyUnicode_New(osize, max);
12693 if (repr == NULL)
12694 return NULL;
12695 okind = PyUnicode_KIND(repr);
12696 odata = PyUnicode_DATA(repr);
12697
12698 PyUnicode_WRITE(okind, odata, 0, quote);
12699 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012700 if (unchanged) {
12701 _PyUnicode_FastCopyCharacters(repr, 1,
12702 unicode, 0,
12703 isize);
12704 }
12705 else {
12706 for (i = 0, o = 1; i < isize; i++) {
12707 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708
Victor Stinner55c08782013-04-14 18:45:39 +020012709 /* Escape quotes and backslashes */
12710 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012711 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012713 continue;
12714 }
12715
12716 /* Map special whitespace to '\t', \n', '\r' */
12717 if (ch == '\t') {
12718 PyUnicode_WRITE(okind, odata, o++, '\\');
12719 PyUnicode_WRITE(okind, odata, o++, 't');
12720 }
12721 else if (ch == '\n') {
12722 PyUnicode_WRITE(okind, odata, o++, '\\');
12723 PyUnicode_WRITE(okind, odata, o++, 'n');
12724 }
12725 else if (ch == '\r') {
12726 PyUnicode_WRITE(okind, odata, o++, '\\');
12727 PyUnicode_WRITE(okind, odata, o++, 'r');
12728 }
12729
12730 /* Map non-printable US ASCII to '\xhh' */
12731 else if (ch < ' ' || ch == 0x7F) {
12732 PyUnicode_WRITE(okind, odata, o++, '\\');
12733 PyUnicode_WRITE(okind, odata, o++, 'x');
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12735 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12736 }
12737
12738 /* Copy ASCII characters as-is */
12739 else if (ch < 0x7F) {
12740 PyUnicode_WRITE(okind, odata, o++, ch);
12741 }
12742
12743 /* Non-ASCII characters */
12744 else {
12745 /* Map Unicode whitespace and control characters
12746 (categories Z* and C* except ASCII space)
12747 */
12748 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12749 PyUnicode_WRITE(okind, odata, o++, '\\');
12750 /* Map 8-bit characters to '\xhh' */
12751 if (ch <= 0xff) {
12752 PyUnicode_WRITE(okind, odata, o++, 'x');
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12755 }
12756 /* Map 16-bit characters to '\uxxxx' */
12757 else if (ch <= 0xffff) {
12758 PyUnicode_WRITE(okind, odata, o++, 'u');
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12763 }
12764 /* Map 21-bit characters to '\U00xxxxxx' */
12765 else {
12766 PyUnicode_WRITE(okind, odata, o++, 'U');
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12775 }
12776 }
12777 /* Copy characters as-is */
12778 else {
12779 PyUnicode_WRITE(okind, odata, o++, ch);
12780 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012781 }
12782 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012785 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012786 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787}
12788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012789PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012790 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791\n\
12792Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012793such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794arguments start and end are interpreted as in slice notation.\n\
12795\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012796Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797
12798static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012801 /* initialize variables to prevent gcc warning */
12802 PyObject *substring = NULL;
12803 Py_ssize_t start = 0;
12804 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012807 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012810 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012813 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 if (result == -2)
12816 return NULL;
12817
Christian Heimes217cfd12007-12-02 14:31:20 +000012818 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819}
12820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012821PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823\n\
Mariatta577fc042017-04-09 15:17:06 -070012824Return the highest index in S where substring sub is found,\n\
12825such that sub is contained within S[start:end]. Optional\n\
12826arguments start and end are interpreted as in slice notation.\n\
12827\n\
12828Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
12830static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012833 /* initialize variables to prevent gcc warning */
12834 PyObject *substring = NULL;
12835 Py_ssize_t start = 0;
12836 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012837 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012839 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012842 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012845 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 if (result == -2)
12848 return NULL;
12849
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850 if (result < 0) {
12851 PyErr_SetString(PyExc_ValueError, "substring not found");
12852 return NULL;
12853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854
Christian Heimes217cfd12007-12-02 14:31:20 +000012855 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856}
12857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012858PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012861Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012862done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863
12864static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012865unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012867 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868 Py_UCS4 fillchar = ' ';
12869
Victor Stinnere9a29352011-10-01 02:14:59 +020012870 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012872
Benjamin Petersonbac79492012-01-14 13:34:47 -050012873 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874 return NULL;
12875
Victor Stinnerc4b49542011-12-11 22:44:26 +010012876 if (PyUnicode_GET_LENGTH(self) >= width)
12877 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878
Victor Stinnerc4b49542011-12-11 22:44:26 +010012879 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880}
12881
Alexander Belopolsky40018472011-02-26 01:02:56 +000012882PyObject *
12883PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012888 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889}
12890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012891PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012892 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893\n\
12894Return a list of the words in S, using sep as the\n\
12895delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012896splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012897whitespace string is a separator and empty strings are\n\
12898removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899
12900static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012901unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012903 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012905 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012907 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12908 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909 return NULL;
12910
12911 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012913
12914 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012915 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012916
12917 PyErr_Format(PyExc_TypeError,
12918 "must be str or None, not %.100s",
12919 Py_TYPE(substring)->tp_name);
12920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921}
12922
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012926 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012927 int kind1, kind2;
12928 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933
Victor Stinner14f8f022011-10-05 20:58:25 +020012934 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 len1 = PyUnicode_GET_LENGTH(str_obj);
12937 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012938 if (kind1 < kind2 || len1 < len2) {
12939 _Py_INCREF_UNICODE_EMPTY();
12940 if (!unicode_empty)
12941 out = NULL;
12942 else {
12943 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12944 Py_DECREF(unicode_empty);
12945 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012946 return out;
12947 }
12948 buf1 = PyUnicode_DATA(str_obj);
12949 buf2 = PyUnicode_DATA(sep_obj);
12950 if (kind2 != kind1) {
12951 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12952 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012953 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012956 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012958 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12959 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12960 else
12961 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 break;
12963 case PyUnicode_2BYTE_KIND:
12964 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 break;
12966 case PyUnicode_4BYTE_KIND:
12967 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968 break;
12969 default:
12970 assert(0);
12971 out = 0;
12972 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012974 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976
12977 return out;
12978}
12979
12980
12981PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012982PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012985 int kind1, kind2;
12986 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012989 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012990 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012992 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 len1 = PyUnicode_GET_LENGTH(str_obj);
12995 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012996 if (kind1 < kind2 || len1 < len2) {
12997 _Py_INCREF_UNICODE_EMPTY();
12998 if (!unicode_empty)
12999 out = NULL;
13000 else {
13001 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13002 Py_DECREF(unicode_empty);
13003 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013004 return out;
13005 }
13006 buf1 = PyUnicode_DATA(str_obj);
13007 buf2 = PyUnicode_DATA(sep_obj);
13008 if (kind2 != kind1) {
13009 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13010 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013011 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013014 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013016 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13017 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13018 else
13019 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 break;
13021 case PyUnicode_2BYTE_KIND:
13022 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13023 break;
13024 case PyUnicode_4BYTE_KIND:
13025 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13026 break;
13027 default:
13028 assert(0);
13029 out = 0;
13030 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013031
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013032 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013034
13035 return out;
13036}
13037
13038PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013039 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013040\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000013041Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013042the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000013043found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044
13045static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013046unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013047{
Victor Stinner9310abb2011-10-05 00:59:23 +020013048 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049}
13050
13051PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000013052 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000013054Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000013056separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013057
13058static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013059unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060{
Victor Stinner9310abb2011-10-05 00:59:23 +020013061 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062}
13063
Alexander Belopolsky40018472011-02-26 01:02:56 +000013064PyObject *
13065PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013066{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013067 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013069
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013070 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013071}
13072
13073PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013074 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013075\n\
13076Return a list of the words in S, using sep as the\n\
13077delimiter string, starting at the end of the string and\n\
13078working to the front. If maxsplit is given, at most maxsplit\n\
13079splits are done. If sep is not specified, any whitespace string\n\
13080is a separator.");
13081
13082static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013083unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013084{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013085 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013086 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013087 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013088
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013089 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13090 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013091 return NULL;
13092
13093 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000013094 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013095
13096 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020013097 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013098
13099 PyErr_Format(PyExc_TypeError,
13100 "must be str or None, not %.100s",
13101 Py_TYPE(substring)->tp_name);
13102 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013103}
13104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013105PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107\n\
13108Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013109Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013110is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
13112static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013113unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013115 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013116 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013118 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13119 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120 return NULL;
13121
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123}
13124
13125static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013126PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013128 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129}
13130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013131PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133\n\
13134Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013135and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
13137static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013138unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013140 if (PyUnicode_READY(self) == -1)
13141 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013142 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143}
13144
Larry Hastings61272b72014-01-07 12:41:53 -080013145/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013146
Larry Hastings31826802013-10-19 00:09:25 -070013147@staticmethod
13148str.maketrans as unicode_maketrans
13149
13150 x: object
13151
13152 y: unicode=NULL
13153
13154 z: unicode=NULL
13155
13156 /
13157
13158Return a translation table usable for str.translate().
13159
13160If there is only one argument, it must be a dictionary mapping Unicode
13161ordinals (integers) or characters to Unicode ordinals, strings or None.
13162Character keys will be then converted to ordinals.
13163If there are two arguments, they must be strings of equal length, and
13164in the resulting dictionary, each character in x will be mapped to the
13165character at the same position in y. If there is a third argument, it
13166must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013167[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013168
Larry Hastings31826802013-10-19 00:09:25 -070013169static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013170unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013171/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013172{
Georg Brandlceee0772007-11-27 23:48:05 +000013173 PyObject *new = NULL, *key, *value;
13174 Py_ssize_t i = 0;
13175 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176
Georg Brandlceee0772007-11-27 23:48:05 +000013177 new = PyDict_New();
13178 if (!new)
13179 return NULL;
13180 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 int x_kind, y_kind, z_kind;
13182 void *x_data, *y_data, *z_data;
13183
Georg Brandlceee0772007-11-27 23:48:05 +000013184 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013185 if (!PyUnicode_Check(x)) {
13186 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13187 "be a string if there is a second argument");
13188 goto err;
13189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013191 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13192 "arguments must have equal length");
13193 goto err;
13194 }
13195 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 x_kind = PyUnicode_KIND(x);
13197 y_kind = PyUnicode_KIND(y);
13198 x_data = PyUnicode_DATA(x);
13199 y_data = PyUnicode_DATA(y);
13200 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13201 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013202 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013203 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013204 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013205 if (!value) {
13206 Py_DECREF(key);
13207 goto err;
13208 }
Georg Brandlceee0772007-11-27 23:48:05 +000013209 res = PyDict_SetItem(new, key, value);
13210 Py_DECREF(key);
13211 Py_DECREF(value);
13212 if (res < 0)
13213 goto err;
13214 }
13215 /* create entries for deleting chars in z */
13216 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 z_kind = PyUnicode_KIND(z);
13218 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013219 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013221 if (!key)
13222 goto err;
13223 res = PyDict_SetItem(new, key, Py_None);
13224 Py_DECREF(key);
13225 if (res < 0)
13226 goto err;
13227 }
13228 }
13229 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 int kind;
13231 void *data;
13232
Georg Brandlceee0772007-11-27 23:48:05 +000013233 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013234 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013235 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13236 "to maketrans it must be a dict");
13237 goto err;
13238 }
13239 /* copy entries into the new dict, converting string keys to int keys */
13240 while (PyDict_Next(x, &i, &key, &value)) {
13241 if (PyUnicode_Check(key)) {
13242 /* convert string keys to integer keys */
13243 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013244 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013245 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13246 "table must be of length 1");
13247 goto err;
13248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 kind = PyUnicode_KIND(key);
13250 data = PyUnicode_DATA(key);
13251 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013252 if (!newkey)
13253 goto err;
13254 res = PyDict_SetItem(new, newkey, value);
13255 Py_DECREF(newkey);
13256 if (res < 0)
13257 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013258 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013259 /* just keep integer keys */
13260 if (PyDict_SetItem(new, key, value) < 0)
13261 goto err;
13262 } else {
13263 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13264 "be strings or integers");
13265 goto err;
13266 }
13267 }
13268 }
13269 return new;
13270 err:
13271 Py_DECREF(new);
13272 return NULL;
13273}
13274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013275PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013278Return a copy of the string S in which each character has been mapped\n\
13279through the given translation table. The table must implement\n\
13280lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13281mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13282this operation raises LookupError, the character is left untouched.\n\
13283Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
13285static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289}
13290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013291PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013294Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
13296static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013297unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013299 if (PyUnicode_READY(self) == -1)
13300 return NULL;
13301 if (PyUnicode_IS_ASCII(self))
13302 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013303 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013306PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013307 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013309Pad a numeric string S with zeros on the left, to fill a field\n\
13310of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311
13312static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013313unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013315 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013316 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013317 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 int kind;
13319 void *data;
13320 Py_UCS4 chr;
13321
Martin v. Löwis18e16552006-02-15 17:27:45 +000013322 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323 return NULL;
13324
Benjamin Petersonbac79492012-01-14 13:34:47 -050013325 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
Victor Stinnerc4b49542011-12-11 22:44:26 +010013328 if (PyUnicode_GET_LENGTH(self) >= width)
13329 return unicode_result_unchanged(self);
13330
13331 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332
13333 u = pad(self, fill, 0, '0');
13334
Walter Dörwald068325e2002-04-15 13:36:47 +000013335 if (u == NULL)
13336 return NULL;
13337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 kind = PyUnicode_KIND(u);
13339 data = PyUnicode_DATA(u);
13340 chr = PyUnicode_READ(kind, data, fill);
13341
13342 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 PyUnicode_WRITE(kind, data, 0, chr);
13345 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346 }
13347
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013348 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013349 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013360PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013363Return True if S starts with the specified prefix, False otherwise.\n\
13364With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013365With optional end, stop comparing S at that position.\n\
13366prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367
13368static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013369unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013372 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013373 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013374 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013375 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013376 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377
Jesus Ceaac451502011-04-20 17:09:23 +020013378 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013379 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013380 if (PyTuple_Check(subobj)) {
13381 Py_ssize_t i;
13382 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013383 substring = PyTuple_GET_ITEM(subobj, i);
13384 if (!PyUnicode_Check(substring)) {
13385 PyErr_Format(PyExc_TypeError,
13386 "tuple for startswith must only contain str, "
13387 "not %.100s",
13388 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013389 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013390 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013391 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013392 if (result == -1)
13393 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013394 if (result) {
13395 Py_RETURN_TRUE;
13396 }
13397 }
13398 /* nothing matched */
13399 Py_RETURN_FALSE;
13400 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013401 if (!PyUnicode_Check(subobj)) {
13402 PyErr_Format(PyExc_TypeError,
13403 "startswith first arg must be str or "
13404 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013406 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013407 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013408 if (result == -1)
13409 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013410 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411}
13412
13413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013414PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013417Return True if S ends with the specified suffix, False otherwise.\n\
13418With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013419With optional end, stop comparing S at that position.\n\
13420suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421
13422static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013423unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013426 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013427 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013428 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013429 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013430 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431
Jesus Ceaac451502011-04-20 17:09:23 +020013432 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013434 if (PyTuple_Check(subobj)) {
13435 Py_ssize_t i;
13436 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013437 substring = PyTuple_GET_ITEM(subobj, i);
13438 if (!PyUnicode_Check(substring)) {
13439 PyErr_Format(PyExc_TypeError,
13440 "tuple for endswith must only contain str, "
13441 "not %.100s",
13442 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013444 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013445 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013446 if (result == -1)
13447 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013448 if (result) {
13449 Py_RETURN_TRUE;
13450 }
13451 }
13452 Py_RETURN_FALSE;
13453 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013454 if (!PyUnicode_Check(subobj)) {
13455 PyErr_Format(PyExc_TypeError,
13456 "endswith first arg must be str or "
13457 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013459 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013460 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013461 if (result == -1)
13462 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013463 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013464}
13465
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013466static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013467_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013468{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013469 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13470 writer->data = PyUnicode_DATA(writer->buffer);
13471
13472 if (!writer->readonly) {
13473 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013474 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013475 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013476 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013477 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13478 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13479 writer->kind = PyUnicode_WCHAR_KIND;
13480 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13481
Victor Stinner8f674cc2013-04-17 23:02:17 +020013482 /* Copy-on-write mode: set buffer size to 0 so
13483 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13484 * next write. */
13485 writer->size = 0;
13486 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013487}
13488
Victor Stinnerd3f08822012-05-29 12:57:52 +020013489void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013490_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013491{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013492 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013493
13494 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013495 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013496
13497 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13498 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13499 writer->kind = PyUnicode_WCHAR_KIND;
13500 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013501}
13502
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503int
13504_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13505 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013506{
13507 Py_ssize_t newlen;
13508 PyObject *newbuffer;
13509
Victor Stinner2740e462016-09-06 16:58:36 -070013510 assert(maxchar <= MAX_UNICODE);
13511
Victor Stinnerca9381e2015-09-22 00:58:32 +020013512 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013513 assert((maxchar > writer->maxchar && length >= 0)
13514 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515
Victor Stinner202fdca2012-05-07 12:47:02 +020013516 if (length > PY_SSIZE_T_MAX - writer->pos) {
13517 PyErr_NoMemory();
13518 return -1;
13519 }
13520 newlen = writer->pos + length;
13521
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013522 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013523
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013525 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013526 if (writer->overallocate
13527 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13528 /* overallocate to limit the number of realloc() */
13529 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013531 if (newlen < writer->min_length)
13532 newlen = writer->min_length;
13533
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534 writer->buffer = PyUnicode_New(newlen, maxchar);
13535 if (writer->buffer == NULL)
13536 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013538 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013539 if (writer->overallocate
13540 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13541 /* overallocate to limit the number of realloc() */
13542 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013544 if (newlen < writer->min_length)
13545 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013547 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013548 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013549 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 newbuffer = PyUnicode_New(newlen, maxchar);
13551 if (newbuffer == NULL)
13552 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13554 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013555 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013556 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013557 }
13558 else {
13559 newbuffer = resize_compact(writer->buffer, newlen);
13560 if (newbuffer == NULL)
13561 return -1;
13562 }
13563 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013564 }
13565 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013566 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567 newbuffer = PyUnicode_New(writer->size, maxchar);
13568 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013569 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13571 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013572 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013573 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013574 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013576
13577#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013578}
13579
Victor Stinnerca9381e2015-09-22 00:58:32 +020013580int
13581_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13582 enum PyUnicode_Kind kind)
13583{
13584 Py_UCS4 maxchar;
13585
13586 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13587 assert(writer->kind < kind);
13588
13589 switch (kind)
13590 {
13591 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13592 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13593 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13594 default:
13595 assert(0 && "invalid kind");
13596 return -1;
13597 }
13598
13599 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13600}
13601
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013602static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013603_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013604{
Victor Stinner2740e462016-09-06 16:58:36 -070013605 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013606 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13607 return -1;
13608 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13609 writer->pos++;
13610 return 0;
13611}
13612
13613int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013614_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13615{
13616 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13617}
13618
13619int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013620_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13621{
13622 Py_UCS4 maxchar;
13623 Py_ssize_t len;
13624
13625 if (PyUnicode_READY(str) == -1)
13626 return -1;
13627 len = PyUnicode_GET_LENGTH(str);
13628 if (len == 0)
13629 return 0;
13630 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13631 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013632 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013633 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013634 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013635 Py_INCREF(str);
13636 writer->buffer = str;
13637 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013638 writer->pos += len;
13639 return 0;
13640 }
13641 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13642 return -1;
13643 }
13644 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13645 str, 0, len);
13646 writer->pos += len;
13647 return 0;
13648}
13649
Victor Stinnere215d962012-10-06 23:03:36 +020013650int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013651_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13652 Py_ssize_t start, Py_ssize_t end)
13653{
13654 Py_UCS4 maxchar;
13655 Py_ssize_t len;
13656
13657 if (PyUnicode_READY(str) == -1)
13658 return -1;
13659
13660 assert(0 <= start);
13661 assert(end <= PyUnicode_GET_LENGTH(str));
13662 assert(start <= end);
13663
13664 if (end == 0)
13665 return 0;
13666
13667 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13668 return _PyUnicodeWriter_WriteStr(writer, str);
13669
13670 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13671 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13672 else
13673 maxchar = writer->maxchar;
13674 len = end - start;
13675
13676 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13677 return -1;
13678
13679 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13680 str, start, len);
13681 writer->pos += len;
13682 return 0;
13683}
13684
13685int
Victor Stinner4a587072013-11-19 12:54:53 +010013686_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13687 const char *ascii, Py_ssize_t len)
13688{
13689 if (len == -1)
13690 len = strlen(ascii);
13691
13692 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13693
13694 if (writer->buffer == NULL && !writer->overallocate) {
13695 PyObject *str;
13696
13697 str = _PyUnicode_FromASCII(ascii, len);
13698 if (str == NULL)
13699 return -1;
13700
13701 writer->readonly = 1;
13702 writer->buffer = str;
13703 _PyUnicodeWriter_Update(writer);
13704 writer->pos += len;
13705 return 0;
13706 }
13707
13708 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13709 return -1;
13710
13711 switch (writer->kind)
13712 {
13713 case PyUnicode_1BYTE_KIND:
13714 {
13715 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13716 Py_UCS1 *data = writer->data;
13717
Christian Heimesf051e432016-09-13 20:22:02 +020013718 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013719 break;
13720 }
13721 case PyUnicode_2BYTE_KIND:
13722 {
13723 _PyUnicode_CONVERT_BYTES(
13724 Py_UCS1, Py_UCS2,
13725 ascii, ascii + len,
13726 (Py_UCS2 *)writer->data + writer->pos);
13727 break;
13728 }
13729 case PyUnicode_4BYTE_KIND:
13730 {
13731 _PyUnicode_CONVERT_BYTES(
13732 Py_UCS1, Py_UCS4,
13733 ascii, ascii + len,
13734 (Py_UCS4 *)writer->data + writer->pos);
13735 break;
13736 }
13737 default:
13738 assert(0);
13739 }
13740
13741 writer->pos += len;
13742 return 0;
13743}
13744
13745int
13746_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13747 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013748{
13749 Py_UCS4 maxchar;
13750
13751 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13752 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13753 return -1;
13754 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13755 writer->pos += len;
13756 return 0;
13757}
13758
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013760_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013761{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013762 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013763
Victor Stinnerd3f08822012-05-29 12:57:52 +020013764 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013765 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013766 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013767 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013768
13769 str = writer->buffer;
13770 writer->buffer = NULL;
13771
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013772 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013773 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13774 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013776
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013777 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13778 PyObject *str2;
13779 str2 = resize_compact(str, writer->pos);
13780 if (str2 == NULL) {
13781 Py_DECREF(str);
13782 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013783 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013784 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013785 }
13786
Victor Stinner15a0bd32013-07-08 22:29:55 +020013787 assert(_PyUnicode_CheckConsistency(str, 1));
13788 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013789}
13790
Victor Stinnerd3f08822012-05-29 12:57:52 +020013791void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013792_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013793{
13794 Py_CLEAR(writer->buffer);
13795}
13796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013797#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013798
13799PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013801\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013802Return a formatted version of S, using substitutions from args and kwargs.\n\
13803The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013804
Eric Smith27bbca62010-11-04 17:06:58 +000013805PyDoc_STRVAR(format_map__doc__,
13806 "S.format_map(mapping) -> str\n\
13807\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013808Return a formatted version of S, using substitutions from mapping.\n\
13809The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013810
Eric Smith4a7d76d2008-05-30 18:10:19 +000013811static PyObject *
13812unicode__format__(PyObject* self, PyObject* args)
13813{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013814 PyObject *format_spec;
13815 _PyUnicodeWriter writer;
13816 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013817
13818 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13819 return NULL;
13820
Victor Stinnerd3f08822012-05-29 12:57:52 +020013821 if (PyUnicode_READY(self) == -1)
13822 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013823 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013824 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13825 self, format_spec, 0,
13826 PyUnicode_GET_LENGTH(format_spec));
13827 if (ret == -1) {
13828 _PyUnicodeWriter_Dealloc(&writer);
13829 return NULL;
13830 }
13831 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013832}
13833
Eric Smith8c663262007-08-25 02:26:07 +000013834PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013835 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013836\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013837Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013838
13839static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013840unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013842 Py_ssize_t size;
13843
13844 /* If it's a compact object, account for base structure +
13845 character data. */
13846 if (PyUnicode_IS_COMPACT_ASCII(v))
13847 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13848 else if (PyUnicode_IS_COMPACT(v))
13849 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013850 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013851 else {
13852 /* If it is a two-block object, account for base object, and
13853 for character block if present. */
13854 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013855 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013856 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013857 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013858 }
13859 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013860 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013861 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013862 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013863 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013864 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013865
13866 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013867}
13868
13869PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013870 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013871
13872static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013873unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013874{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013875 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013876 if (!copy)
13877 return NULL;
13878 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013879}
13880
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013882 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013883 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013884 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13885 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013886 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13887 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013888 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013889 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13890 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13891 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013892 {"expandtabs", (PyCFunction) unicode_expandtabs,
13893 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013894 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013895 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013896 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13897 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13898 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013899 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013900 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13901 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13902 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013903 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013904 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013905 {"splitlines", (PyCFunction) unicode_splitlines,
13906 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013907 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013908 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13909 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13910 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13911 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13912 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13913 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13914 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13915 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13916 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13917 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13918 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13919 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13920 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13921 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013922 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013923 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013924 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013925 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013926 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013927 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013928 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013929 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013930#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013931 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013932 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933#endif
13934
Benjamin Peterson14339b62009-01-31 16:36:08 +000013935 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936 {NULL, NULL}
13937};
13938
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013939static PyObject *
13940unicode_mod(PyObject *v, PyObject *w)
13941{
Brian Curtindfc80e32011-08-10 20:28:54 -050013942 if (!PyUnicode_Check(v))
13943 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013944 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013945}
13946
13947static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013948 0, /*nb_add*/
13949 0, /*nb_subtract*/
13950 0, /*nb_multiply*/
13951 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013952};
13953
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 (lenfunc) unicode_length, /* sq_length */
13956 PyUnicode_Concat, /* sq_concat */
13957 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13958 (ssizeargfunc) unicode_getitem, /* sq_item */
13959 0, /* sq_slice */
13960 0, /* sq_ass_item */
13961 0, /* sq_ass_slice */
13962 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013963};
13964
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013965static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013966unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013968 if (PyUnicode_READY(self) == -1)
13969 return NULL;
13970
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013971 if (PyIndex_Check(item)) {
13972 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013973 if (i == -1 && PyErr_Occurred())
13974 return NULL;
13975 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013976 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013977 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013978 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013979 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013980 PyObject *result;
13981 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013982 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013983 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013984
Serhiy Storchakac26b19d2017-04-08 11:18:14 +030013985 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013986 return NULL;
13987 }
Serhiy Storchakac26b19d2017-04-08 11:18:14 +030013988 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13989 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013990
13991 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013992 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013993 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013994 slicelength == PyUnicode_GET_LENGTH(self)) {
13995 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013996 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013997 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013998 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013999 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014000 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014001 src_kind = PyUnicode_KIND(self);
14002 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014003 if (!PyUnicode_IS_ASCII(self)) {
14004 kind_limit = kind_maxchar_limit(src_kind);
14005 max_char = 0;
14006 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14007 ch = PyUnicode_READ(src_kind, src_data, cur);
14008 if (ch > max_char) {
14009 max_char = ch;
14010 if (max_char >= kind_limit)
14011 break;
14012 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014013 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 }
Victor Stinner55c99112011-10-13 01:17:06 +020014015 else
14016 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014017 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014018 if (result == NULL)
14019 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014020 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014021 dest_data = PyUnicode_DATA(result);
14022
14023 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14025 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014026 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014027 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014028 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014029 } else {
14030 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14031 return NULL;
14032 }
14033}
14034
14035static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 (lenfunc)unicode_length, /* mp_length */
14037 (binaryfunc)unicode_subscript, /* mp_subscript */
14038 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014039};
14040
Guido van Rossumd57fd912000-03-10 22:53:23 +000014041
Guido van Rossumd57fd912000-03-10 22:53:23 +000014042/* Helpers for PyUnicode_Format() */
14043
Victor Stinnera47082312012-10-04 02:19:54 +020014044struct unicode_formatter_t {
14045 PyObject *args;
14046 int args_owned;
14047 Py_ssize_t arglen, argidx;
14048 PyObject *dict;
14049
14050 enum PyUnicode_Kind fmtkind;
14051 Py_ssize_t fmtcnt, fmtpos;
14052 void *fmtdata;
14053 PyObject *fmtstr;
14054
14055 _PyUnicodeWriter writer;
14056};
14057
14058struct unicode_format_arg_t {
14059 Py_UCS4 ch;
14060 int flags;
14061 Py_ssize_t width;
14062 int prec;
14063 int sign;
14064};
14065
Guido van Rossumd57fd912000-03-10 22:53:23 +000014066static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014067unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014068{
Victor Stinnera47082312012-10-04 02:19:54 +020014069 Py_ssize_t argidx = ctx->argidx;
14070
14071 if (argidx < ctx->arglen) {
14072 ctx->argidx++;
14073 if (ctx->arglen < 0)
14074 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014075 else
Victor Stinnera47082312012-10-04 02:19:54 +020014076 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014077 }
14078 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014079 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080 return NULL;
14081}
14082
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014083/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084
Victor Stinnera47082312012-10-04 02:19:54 +020014085/* Format a float into the writer if the writer is not NULL, or into *p_output
14086 otherwise.
14087
14088 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089static int
Victor Stinnera47082312012-10-04 02:19:54 +020014090formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14091 PyObject **p_output,
14092 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014093{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014094 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014096 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014097 int prec;
14098 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014099
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100 x = PyFloat_AsDouble(v);
14101 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014102 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014103
Victor Stinnera47082312012-10-04 02:19:54 +020014104 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014105 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014107
Victor Stinnera47082312012-10-04 02:19:54 +020014108 if (arg->flags & F_ALT)
14109 dtoa_flags = Py_DTSF_ALT;
14110 else
14111 dtoa_flags = 0;
14112 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014113 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014114 return -1;
14115 len = strlen(p);
14116 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014117 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014118 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014120 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014121 }
14122 else
14123 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014124 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014125 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014126}
14127
Victor Stinnerd0880d52012-04-27 23:40:13 +020014128/* formatlong() emulates the format codes d, u, o, x and X, and
14129 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14130 * Python's regular ints.
14131 * Return value: a new PyUnicodeObject*, or NULL if error.
14132 * The output string is of the form
14133 * "-"? ("0x" | "0X")? digit+
14134 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14135 * set in flags. The case of hex digits will be correct,
14136 * There will be at least prec digits, zero-filled on the left if
14137 * necessary to get that many.
14138 * val object to be converted
14139 * flags bitmask of format flags; only F_ALT is looked at
14140 * prec minimum number of digits; 0-fill on left if needed
14141 * type a character in [duoxX]; u acts the same as d
14142 *
14143 * CAUTION: o, x and X conversions on regular ints can never
14144 * produce a '-' sign, but can for Python's unbounded ints.
14145 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014146PyObject *
14147_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014148{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014149 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014150 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014151 Py_ssize_t i;
14152 int sign; /* 1 if '-', else 0 */
14153 int len; /* number of characters */
14154 Py_ssize_t llen;
14155 int numdigits; /* len == numnondigits + numdigits */
14156 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014157
Victor Stinnerd0880d52012-04-27 23:40:13 +020014158 /* Avoid exceeding SSIZE_T_MAX */
14159 if (prec > INT_MAX-3) {
14160 PyErr_SetString(PyExc_OverflowError,
14161 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014162 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014163 }
14164
14165 assert(PyLong_Check(val));
14166
14167 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014168 default:
14169 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014170 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014171 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014172 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014173 /* int and int subclasses should print numerically when a numeric */
14174 /* format code is used (see issue18780) */
14175 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014176 break;
14177 case 'o':
14178 numnondigits = 2;
14179 result = PyNumber_ToBase(val, 8);
14180 break;
14181 case 'x':
14182 case 'X':
14183 numnondigits = 2;
14184 result = PyNumber_ToBase(val, 16);
14185 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014186 }
14187 if (!result)
14188 return NULL;
14189
14190 assert(unicode_modifiable(result));
14191 assert(PyUnicode_IS_READY(result));
14192 assert(PyUnicode_IS_ASCII(result));
14193
14194 /* To modify the string in-place, there can only be one reference. */
14195 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014196 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014197 PyErr_BadInternalCall();
14198 return NULL;
14199 }
14200 buf = PyUnicode_DATA(result);
14201 llen = PyUnicode_GET_LENGTH(result);
14202 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014203 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014204 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014205 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014206 return NULL;
14207 }
14208 len = (int)llen;
14209 sign = buf[0] == '-';
14210 numnondigits += sign;
14211 numdigits = len - numnondigits;
14212 assert(numdigits > 0);
14213
14214 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014215 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014216 (type == 'o' || type == 'x' || type == 'X'))) {
14217 assert(buf[sign] == '0');
14218 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14219 buf[sign+1] == 'o');
14220 numnondigits -= 2;
14221 buf += 2;
14222 len -= 2;
14223 if (sign)
14224 buf[0] = '-';
14225 assert(len == numnondigits + numdigits);
14226 assert(numdigits > 0);
14227 }
14228
14229 /* Fill with leading zeroes to meet minimum width. */
14230 if (prec > numdigits) {
14231 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14232 numnondigits + prec);
14233 char *b1;
14234 if (!r1) {
14235 Py_DECREF(result);
14236 return NULL;
14237 }
14238 b1 = PyBytes_AS_STRING(r1);
14239 for (i = 0; i < numnondigits; ++i)
14240 *b1++ = *buf++;
14241 for (i = 0; i < prec - numdigits; i++)
14242 *b1++ = '0';
14243 for (i = 0; i < numdigits; i++)
14244 *b1++ = *buf++;
14245 *b1 = '\0';
14246 Py_DECREF(result);
14247 result = r1;
14248 buf = PyBytes_AS_STRING(result);
14249 len = numnondigits + prec;
14250 }
14251
14252 /* Fix up case for hex conversions. */
14253 if (type == 'X') {
14254 /* Need to convert all lower case letters to upper case.
14255 and need to convert 0x to 0X (and -0x to -0X). */
14256 for (i = 0; i < len; i++)
14257 if (buf[i] >= 'a' && buf[i] <= 'x')
14258 buf[i] -= 'a'-'A';
14259 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014260 if (!PyUnicode_Check(result)
14261 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014262 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014263 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014264 Py_DECREF(result);
14265 result = unicode;
14266 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014267 else if (len != PyUnicode_GET_LENGTH(result)) {
14268 if (PyUnicode_Resize(&result, len) < 0)
14269 Py_CLEAR(result);
14270 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014272}
14273
Ethan Furmandf3ed242014-01-05 06:50:30 -080014274/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014275 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014276 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 * -1 and raise an exception on error */
14278static int
Victor Stinnera47082312012-10-04 02:19:54 +020014279mainformatlong(PyObject *v,
14280 struct unicode_format_arg_t *arg,
14281 PyObject **p_output,
14282 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014283{
14284 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014285 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014286
14287 if (!PyNumber_Check(v))
14288 goto wrongtype;
14289
Ethan Furman9ab74802014-03-21 06:38:46 -070014290 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014292 if (type == 'o' || type == 'x' || type == 'X') {
14293 iobj = PyNumber_Index(v);
14294 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014295 if (PyErr_ExceptionMatches(PyExc_TypeError))
14296 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014297 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014298 }
14299 }
14300 else {
14301 iobj = PyNumber_Long(v);
14302 if (iobj == NULL ) {
14303 if (PyErr_ExceptionMatches(PyExc_TypeError))
14304 goto wrongtype;
14305 return -1;
14306 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 }
14308 assert(PyLong_Check(iobj));
14309 }
14310 else {
14311 iobj = v;
14312 Py_INCREF(iobj);
14313 }
14314
14315 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014316 && arg->width == -1 && arg->prec == -1
14317 && !(arg->flags & (F_SIGN | F_BLANK))
14318 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014319 {
14320 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014321 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 int base;
14323
Victor Stinnera47082312012-10-04 02:19:54 +020014324 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014325 {
14326 default:
14327 assert(0 && "'type' not in [diuoxX]");
14328 case 'd':
14329 case 'i':
14330 case 'u':
14331 base = 10;
14332 break;
14333 case 'o':
14334 base = 8;
14335 break;
14336 case 'x':
14337 case 'X':
14338 base = 16;
14339 break;
14340 }
14341
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014342 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14343 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014344 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014345 }
14346 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014347 return 1;
14348 }
14349
Ethan Furmanb95b5612015-01-23 20:05:18 -080014350 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 Py_DECREF(iobj);
14352 if (res == NULL)
14353 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014354 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014355 return 0;
14356
14357wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014358 switch(type)
14359 {
14360 case 'o':
14361 case 'x':
14362 case 'X':
14363 PyErr_Format(PyExc_TypeError,
14364 "%%%c format: an integer is required, "
14365 "not %.200s",
14366 type, Py_TYPE(v)->tp_name);
14367 break;
14368 default:
14369 PyErr_Format(PyExc_TypeError,
14370 "%%%c format: a number is required, "
14371 "not %.200s",
14372 type, Py_TYPE(v)->tp_name);
14373 break;
14374 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014375 return -1;
14376}
14377
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014378static Py_UCS4
14379formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014380{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014381 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014382 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014383 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014384 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014385 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014386 goto onError;
14387 }
14388 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014389 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014390 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014391 /* make sure number is a type of integer */
14392 if (!PyLong_Check(v)) {
14393 iobj = PyNumber_Index(v);
14394 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014395 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014396 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014397 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014398 Py_DECREF(iobj);
14399 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014400 else {
14401 x = PyLong_AsLong(v);
14402 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 if (x == -1 && PyErr_Occurred())
14404 goto onError;
14405
Victor Stinner8faf8212011-12-08 22:14:11 +010014406 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014407 PyErr_SetString(PyExc_OverflowError,
14408 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014409 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014410 }
14411
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014412 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014414
Benjamin Peterson29060642009-01-31 22:14:21 +000014415 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014416 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014418 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014419}
14420
Victor Stinnera47082312012-10-04 02:19:54 +020014421/* Parse options of an argument: flags, width, precision.
14422 Handle also "%(name)" syntax.
14423
14424 Return 0 if the argument has been formatted into arg->str.
14425 Return 1 if the argument has been written into ctx->writer,
14426 Raise an exception and return -1 on error. */
14427static int
14428unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14429 struct unicode_format_arg_t *arg)
14430{
14431#define FORMAT_READ(ctx) \
14432 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14433
14434 PyObject *v;
14435
Victor Stinnera47082312012-10-04 02:19:54 +020014436 if (arg->ch == '(') {
14437 /* Get argument value from a dictionary. Example: "%(name)s". */
14438 Py_ssize_t keystart;
14439 Py_ssize_t keylen;
14440 PyObject *key;
14441 int pcount = 1;
14442
14443 if (ctx->dict == NULL) {
14444 PyErr_SetString(PyExc_TypeError,
14445 "format requires a mapping");
14446 return -1;
14447 }
14448 ++ctx->fmtpos;
14449 --ctx->fmtcnt;
14450 keystart = ctx->fmtpos;
14451 /* Skip over balanced parentheses */
14452 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14453 arg->ch = FORMAT_READ(ctx);
14454 if (arg->ch == ')')
14455 --pcount;
14456 else if (arg->ch == '(')
14457 ++pcount;
14458 ctx->fmtpos++;
14459 }
14460 keylen = ctx->fmtpos - keystart - 1;
14461 if (ctx->fmtcnt < 0 || pcount > 0) {
14462 PyErr_SetString(PyExc_ValueError,
14463 "incomplete format key");
14464 return -1;
14465 }
14466 key = PyUnicode_Substring(ctx->fmtstr,
14467 keystart, keystart + keylen);
14468 if (key == NULL)
14469 return -1;
14470 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014471 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014472 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014473 }
14474 ctx->args = PyObject_GetItem(ctx->dict, key);
14475 Py_DECREF(key);
14476 if (ctx->args == NULL)
14477 return -1;
14478 ctx->args_owned = 1;
14479 ctx->arglen = -1;
14480 ctx->argidx = -2;
14481 }
14482
14483 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014484 while (--ctx->fmtcnt >= 0) {
14485 arg->ch = FORMAT_READ(ctx);
14486 ctx->fmtpos++;
14487 switch (arg->ch) {
14488 case '-': arg->flags |= F_LJUST; continue;
14489 case '+': arg->flags |= F_SIGN; continue;
14490 case ' ': arg->flags |= F_BLANK; continue;
14491 case '#': arg->flags |= F_ALT; continue;
14492 case '0': arg->flags |= F_ZERO; continue;
14493 }
14494 break;
14495 }
14496
14497 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014498 if (arg->ch == '*') {
14499 v = unicode_format_getnextarg(ctx);
14500 if (v == NULL)
14501 return -1;
14502 if (!PyLong_Check(v)) {
14503 PyErr_SetString(PyExc_TypeError,
14504 "* wants int");
14505 return -1;
14506 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014507 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014508 if (arg->width == -1 && PyErr_Occurred())
14509 return -1;
14510 if (arg->width < 0) {
14511 arg->flags |= F_LJUST;
14512 arg->width = -arg->width;
14513 }
14514 if (--ctx->fmtcnt >= 0) {
14515 arg->ch = FORMAT_READ(ctx);
14516 ctx->fmtpos++;
14517 }
14518 }
14519 else if (arg->ch >= '0' && arg->ch <= '9') {
14520 arg->width = arg->ch - '0';
14521 while (--ctx->fmtcnt >= 0) {
14522 arg->ch = FORMAT_READ(ctx);
14523 ctx->fmtpos++;
14524 if (arg->ch < '0' || arg->ch > '9')
14525 break;
14526 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14527 mixing signed and unsigned comparison. Since arg->ch is between
14528 '0' and '9', casting to int is safe. */
14529 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14530 PyErr_SetString(PyExc_ValueError,
14531 "width too big");
14532 return -1;
14533 }
14534 arg->width = arg->width*10 + (arg->ch - '0');
14535 }
14536 }
14537
14538 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014539 if (arg->ch == '.') {
14540 arg->prec = 0;
14541 if (--ctx->fmtcnt >= 0) {
14542 arg->ch = FORMAT_READ(ctx);
14543 ctx->fmtpos++;
14544 }
14545 if (arg->ch == '*') {
14546 v = unicode_format_getnextarg(ctx);
14547 if (v == NULL)
14548 return -1;
14549 if (!PyLong_Check(v)) {
14550 PyErr_SetString(PyExc_TypeError,
14551 "* wants int");
14552 return -1;
14553 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014554 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014555 if (arg->prec == -1 && PyErr_Occurred())
14556 return -1;
14557 if (arg->prec < 0)
14558 arg->prec = 0;
14559 if (--ctx->fmtcnt >= 0) {
14560 arg->ch = FORMAT_READ(ctx);
14561 ctx->fmtpos++;
14562 }
14563 }
14564 else if (arg->ch >= '0' && arg->ch <= '9') {
14565 arg->prec = arg->ch - '0';
14566 while (--ctx->fmtcnt >= 0) {
14567 arg->ch = FORMAT_READ(ctx);
14568 ctx->fmtpos++;
14569 if (arg->ch < '0' || arg->ch > '9')
14570 break;
14571 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14572 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014573 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014574 return -1;
14575 }
14576 arg->prec = arg->prec*10 + (arg->ch - '0');
14577 }
14578 }
14579 }
14580
14581 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14582 if (ctx->fmtcnt >= 0) {
14583 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14584 if (--ctx->fmtcnt >= 0) {
14585 arg->ch = FORMAT_READ(ctx);
14586 ctx->fmtpos++;
14587 }
14588 }
14589 }
14590 if (ctx->fmtcnt < 0) {
14591 PyErr_SetString(PyExc_ValueError,
14592 "incomplete format");
14593 return -1;
14594 }
14595 return 0;
14596
14597#undef FORMAT_READ
14598}
14599
14600/* Format one argument. Supported conversion specifiers:
14601
14602 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014603 - "i", "d", "u": int or float
14604 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014605 - "e", "E", "f", "F", "g", "G": float
14606 - "c": int or str (1 character)
14607
Victor Stinner8dbd4212012-12-04 09:30:24 +010014608 When possible, the output is written directly into the Unicode writer
14609 (ctx->writer). A string is created when padding is required.
14610
Victor Stinnera47082312012-10-04 02:19:54 +020014611 Return 0 if the argument has been formatted into *p_str,
14612 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014613 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014614static int
14615unicode_format_arg_format(struct unicode_formatter_t *ctx,
14616 struct unicode_format_arg_t *arg,
14617 PyObject **p_str)
14618{
14619 PyObject *v;
14620 _PyUnicodeWriter *writer = &ctx->writer;
14621
14622 if (ctx->fmtcnt == 0)
14623 ctx->writer.overallocate = 0;
14624
14625 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014626 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014627 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014628 return 1;
14629 }
14630
14631 v = unicode_format_getnextarg(ctx);
14632 if (v == NULL)
14633 return -1;
14634
Victor Stinnera47082312012-10-04 02:19:54 +020014635
14636 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014637 case 's':
14638 case 'r':
14639 case 'a':
14640 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14641 /* Fast path */
14642 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14643 return -1;
14644 return 1;
14645 }
14646
14647 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14648 *p_str = v;
14649 Py_INCREF(*p_str);
14650 }
14651 else {
14652 if (arg->ch == 's')
14653 *p_str = PyObject_Str(v);
14654 else if (arg->ch == 'r')
14655 *p_str = PyObject_Repr(v);
14656 else
14657 *p_str = PyObject_ASCII(v);
14658 }
14659 break;
14660
14661 case 'i':
14662 case 'd':
14663 case 'u':
14664 case 'o':
14665 case 'x':
14666 case 'X':
14667 {
14668 int ret = mainformatlong(v, arg, p_str, writer);
14669 if (ret != 0)
14670 return ret;
14671 arg->sign = 1;
14672 break;
14673 }
14674
14675 case 'e':
14676 case 'E':
14677 case 'f':
14678 case 'F':
14679 case 'g':
14680 case 'G':
14681 if (arg->width == -1 && arg->prec == -1
14682 && !(arg->flags & (F_SIGN | F_BLANK)))
14683 {
14684 /* Fast path */
14685 if (formatfloat(v, arg, NULL, writer) == -1)
14686 return -1;
14687 return 1;
14688 }
14689
14690 arg->sign = 1;
14691 if (formatfloat(v, arg, p_str, NULL) == -1)
14692 return -1;
14693 break;
14694
14695 case 'c':
14696 {
14697 Py_UCS4 ch = formatchar(v);
14698 if (ch == (Py_UCS4) -1)
14699 return -1;
14700 if (arg->width == -1 && arg->prec == -1) {
14701 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014702 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014703 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014704 return 1;
14705 }
14706 *p_str = PyUnicode_FromOrdinal(ch);
14707 break;
14708 }
14709
14710 default:
14711 PyErr_Format(PyExc_ValueError,
14712 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014713 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014714 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14715 (int)arg->ch,
14716 ctx->fmtpos - 1);
14717 return -1;
14718 }
14719 if (*p_str == NULL)
14720 return -1;
14721 assert (PyUnicode_Check(*p_str));
14722 return 0;
14723}
14724
14725static int
14726unicode_format_arg_output(struct unicode_formatter_t *ctx,
14727 struct unicode_format_arg_t *arg,
14728 PyObject *str)
14729{
14730 Py_ssize_t len;
14731 enum PyUnicode_Kind kind;
14732 void *pbuf;
14733 Py_ssize_t pindex;
14734 Py_UCS4 signchar;
14735 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014736 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014737 Py_ssize_t sublen;
14738 _PyUnicodeWriter *writer = &ctx->writer;
14739 Py_UCS4 fill;
14740
14741 fill = ' ';
14742 if (arg->sign && arg->flags & F_ZERO)
14743 fill = '0';
14744
14745 if (PyUnicode_READY(str) == -1)
14746 return -1;
14747
14748 len = PyUnicode_GET_LENGTH(str);
14749 if ((arg->width == -1 || arg->width <= len)
14750 && (arg->prec == -1 || arg->prec >= len)
14751 && !(arg->flags & (F_SIGN | F_BLANK)))
14752 {
14753 /* Fast path */
14754 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14755 return -1;
14756 return 0;
14757 }
14758
14759 /* Truncate the string for "s", "r" and "a" formats
14760 if the precision is set */
14761 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14762 if (arg->prec >= 0 && len > arg->prec)
14763 len = arg->prec;
14764 }
14765
14766 /* Adjust sign and width */
14767 kind = PyUnicode_KIND(str);
14768 pbuf = PyUnicode_DATA(str);
14769 pindex = 0;
14770 signchar = '\0';
14771 if (arg->sign) {
14772 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14773 if (ch == '-' || ch == '+') {
14774 signchar = ch;
14775 len--;
14776 pindex++;
14777 }
14778 else if (arg->flags & F_SIGN)
14779 signchar = '+';
14780 else if (arg->flags & F_BLANK)
14781 signchar = ' ';
14782 else
14783 arg->sign = 0;
14784 }
14785 if (arg->width < len)
14786 arg->width = len;
14787
14788 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014789 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014790 if (!(arg->flags & F_LJUST)) {
14791 if (arg->sign) {
14792 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014793 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014794 }
14795 else {
14796 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014797 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014798 }
14799 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014800 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14801 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014802 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014803 }
14804
Victor Stinnera47082312012-10-04 02:19:54 +020014805 buflen = arg->width;
14806 if (arg->sign && len == arg->width)
14807 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014808 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014809 return -1;
14810
14811 /* Write the sign if needed */
14812 if (arg->sign) {
14813 if (fill != ' ') {
14814 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14815 writer->pos += 1;
14816 }
14817 if (arg->width > len)
14818 arg->width--;
14819 }
14820
14821 /* Write the numeric prefix for "x", "X" and "o" formats
14822 if the alternate form is used.
14823 For example, write "0x" for the "%#x" format. */
14824 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14825 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14826 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14827 if (fill != ' ') {
14828 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14829 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14830 writer->pos += 2;
14831 pindex += 2;
14832 }
14833 arg->width -= 2;
14834 if (arg->width < 0)
14835 arg->width = 0;
14836 len -= 2;
14837 }
14838
14839 /* Pad left with the fill character if needed */
14840 if (arg->width > len && !(arg->flags & F_LJUST)) {
14841 sublen = arg->width - len;
14842 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14843 writer->pos += sublen;
14844 arg->width = len;
14845 }
14846
14847 /* If padding with spaces: write sign if needed and/or numeric prefix if
14848 the alternate form is used */
14849 if (fill == ' ') {
14850 if (arg->sign) {
14851 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14852 writer->pos += 1;
14853 }
14854 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14855 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14856 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14857 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14858 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14859 writer->pos += 2;
14860 pindex += 2;
14861 }
14862 }
14863
14864 /* Write characters */
14865 if (len) {
14866 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14867 str, pindex, len);
14868 writer->pos += len;
14869 }
14870
14871 /* Pad right with the fill character if needed */
14872 if (arg->width > len) {
14873 sublen = arg->width - len;
14874 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14875 writer->pos += sublen;
14876 }
14877 return 0;
14878}
14879
14880/* Helper of PyUnicode_Format(): format one arg.
14881 Return 0 on success, raise an exception and return -1 on error. */
14882static int
14883unicode_format_arg(struct unicode_formatter_t *ctx)
14884{
14885 struct unicode_format_arg_t arg;
14886 PyObject *str;
14887 int ret;
14888
Victor Stinner8dbd4212012-12-04 09:30:24 +010014889 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14890 arg.flags = 0;
14891 arg.width = -1;
14892 arg.prec = -1;
14893 arg.sign = 0;
14894 str = NULL;
14895
Victor Stinnera47082312012-10-04 02:19:54 +020014896 ret = unicode_format_arg_parse(ctx, &arg);
14897 if (ret == -1)
14898 return -1;
14899
14900 ret = unicode_format_arg_format(ctx, &arg, &str);
14901 if (ret == -1)
14902 return -1;
14903
14904 if (ret != 1) {
14905 ret = unicode_format_arg_output(ctx, &arg, str);
14906 Py_DECREF(str);
14907 if (ret == -1)
14908 return -1;
14909 }
14910
14911 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14912 PyErr_SetString(PyExc_TypeError,
14913 "not all arguments converted during string formatting");
14914 return -1;
14915 }
14916 return 0;
14917}
14918
Alexander Belopolsky40018472011-02-26 01:02:56 +000014919PyObject *
14920PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014921{
Victor Stinnera47082312012-10-04 02:19:54 +020014922 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014923
Guido van Rossumd57fd912000-03-10 22:53:23 +000014924 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014925 PyErr_BadInternalCall();
14926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014927 }
Victor Stinnera47082312012-10-04 02:19:54 +020014928
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014929 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014930 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014931
14932 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014933 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14934 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14935 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14936 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014937
Victor Stinner8f674cc2013-04-17 23:02:17 +020014938 _PyUnicodeWriter_Init(&ctx.writer);
14939 ctx.writer.min_length = ctx.fmtcnt + 100;
14940 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014941
Guido van Rossumd57fd912000-03-10 22:53:23 +000014942 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014943 ctx.arglen = PyTuple_Size(args);
14944 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014945 }
14946 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014947 ctx.arglen = -1;
14948 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949 }
Victor Stinnera47082312012-10-04 02:19:54 +020014950 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014951 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014952 ctx.dict = args;
14953 else
14954 ctx.dict = NULL;
14955 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956
Victor Stinnera47082312012-10-04 02:19:54 +020014957 while (--ctx.fmtcnt >= 0) {
14958 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014959 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014960
14961 nonfmtpos = ctx.fmtpos++;
14962 while (ctx.fmtcnt >= 0 &&
14963 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14964 ctx.fmtpos++;
14965 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014966 }
Victor Stinnera47082312012-10-04 02:19:54 +020014967 if (ctx.fmtcnt < 0) {
14968 ctx.fmtpos--;
14969 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014970 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014971
Victor Stinnercfc4c132013-04-03 01:48:39 +020014972 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14973 nonfmtpos, ctx.fmtpos) < 0)
14974 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014975 }
14976 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014977 ctx.fmtpos++;
14978 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014979 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014980 }
14981 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014982
Victor Stinnera47082312012-10-04 02:19:54 +020014983 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014984 PyErr_SetString(PyExc_TypeError,
14985 "not all arguments converted during string formatting");
14986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987 }
14988
Victor Stinnera47082312012-10-04 02:19:54 +020014989 if (ctx.args_owned) {
14990 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014991 }
Victor Stinnera47082312012-10-04 02:19:54 +020014992 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014993
Benjamin Peterson29060642009-01-31 22:14:21 +000014994 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014995 _PyUnicodeWriter_Dealloc(&ctx.writer);
14996 if (ctx.args_owned) {
14997 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014998 }
14999 return NULL;
15000}
15001
Jeremy Hylton938ace62002-07-17 16:30:39 +000015002static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015003unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15004
Tim Peters6d6c1a32001-08-02 04:15:00 +000015005static PyObject *
15006unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15007{
Benjamin Peterson29060642009-01-31 22:14:21 +000015008 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015009 static char *kwlist[] = {"object", "encoding", "errors", 0};
15010 char *encoding = NULL;
15011 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015012
Benjamin Peterson14339b62009-01-31 16:36:08 +000015013 if (type != &PyUnicode_Type)
15014 return unicode_subtype_new(type, args, kwds);
15015 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015016 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 return NULL;
15018 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015019 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015020 if (encoding == NULL && errors == NULL)
15021 return PyObject_Str(x);
15022 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015023 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015024}
15025
Guido van Rossume023fe02001-08-30 03:12:59 +000015026static PyObject *
15027unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15028{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015029 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015030 Py_ssize_t length, char_size;
15031 int share_wstr, share_utf8;
15032 unsigned int kind;
15033 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015034
Benjamin Peterson14339b62009-01-31 16:36:08 +000015035 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015036
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015037 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015038 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015039 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015040 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015041 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015042 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015044 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015045
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015046 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047 if (self == NULL) {
15048 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 return NULL;
15050 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015051 kind = PyUnicode_KIND(unicode);
15052 length = PyUnicode_GET_LENGTH(unicode);
15053
15054 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015055#ifdef Py_DEBUG
15056 _PyUnicode_HASH(self) = -1;
15057#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015059#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060 _PyUnicode_STATE(self).interned = 0;
15061 _PyUnicode_STATE(self).kind = kind;
15062 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015063 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015064 _PyUnicode_STATE(self).ready = 1;
15065 _PyUnicode_WSTR(self) = NULL;
15066 _PyUnicode_UTF8_LENGTH(self) = 0;
15067 _PyUnicode_UTF8(self) = NULL;
15068 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015069 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070
15071 share_utf8 = 0;
15072 share_wstr = 0;
15073 if (kind == PyUnicode_1BYTE_KIND) {
15074 char_size = 1;
15075 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15076 share_utf8 = 1;
15077 }
15078 else if (kind == PyUnicode_2BYTE_KIND) {
15079 char_size = 2;
15080 if (sizeof(wchar_t) == 2)
15081 share_wstr = 1;
15082 }
15083 else {
15084 assert(kind == PyUnicode_4BYTE_KIND);
15085 char_size = 4;
15086 if (sizeof(wchar_t) == 4)
15087 share_wstr = 1;
15088 }
15089
15090 /* Ensure we won't overflow the length. */
15091 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15092 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015093 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015094 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015095 data = PyObject_MALLOC((length + 1) * char_size);
15096 if (data == NULL) {
15097 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015098 goto onError;
15099 }
15100
Victor Stinnerc3c74152011-10-02 20:39:55 +020015101 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015102 if (share_utf8) {
15103 _PyUnicode_UTF8_LENGTH(self) = length;
15104 _PyUnicode_UTF8(self) = data;
15105 }
15106 if (share_wstr) {
15107 _PyUnicode_WSTR_LENGTH(self) = length;
15108 _PyUnicode_WSTR(self) = (wchar_t *)data;
15109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015110
Christian Heimesf051e432016-09-13 20:22:02 +020015111 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015112 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015113 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015114#ifdef Py_DEBUG
15115 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15116#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015117 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015118 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015119
15120onError:
15121 Py_DECREF(unicode);
15122 Py_DECREF(self);
15123 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015124}
15125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015126PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015127"str(object='') -> str\n\
15128str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015129\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015130Create a new string object from the given object. If encoding or\n\
15131errors is specified, then the object must expose a data buffer\n\
15132that will be decoded using the given encoding and error handler.\n\
15133Otherwise, returns the result of object.__str__() (if defined)\n\
15134or repr(object).\n\
15135encoding defaults to sys.getdefaultencoding().\n\
15136errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015137
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015138static PyObject *unicode_iter(PyObject *seq);
15139
Guido van Rossumd57fd912000-03-10 22:53:23 +000015140PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015141 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 "str", /* tp_name */
15143 sizeof(PyUnicodeObject), /* tp_size */
15144 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015145 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015146 (destructor)unicode_dealloc, /* tp_dealloc */
15147 0, /* tp_print */
15148 0, /* tp_getattr */
15149 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015150 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 unicode_repr, /* tp_repr */
15152 &unicode_as_number, /* tp_as_number */
15153 &unicode_as_sequence, /* tp_as_sequence */
15154 &unicode_as_mapping, /* tp_as_mapping */
15155 (hashfunc) unicode_hash, /* tp_hash*/
15156 0, /* tp_call*/
15157 (reprfunc) unicode_str, /* tp_str */
15158 PyObject_GenericGetAttr, /* tp_getattro */
15159 0, /* tp_setattro */
15160 0, /* tp_as_buffer */
15161 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015162 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 unicode_doc, /* tp_doc */
15164 0, /* tp_traverse */
15165 0, /* tp_clear */
15166 PyUnicode_RichCompare, /* tp_richcompare */
15167 0, /* tp_weaklistoffset */
15168 unicode_iter, /* tp_iter */
15169 0, /* tp_iternext */
15170 unicode_methods, /* tp_methods */
15171 0, /* tp_members */
15172 0, /* tp_getset */
15173 &PyBaseObject_Type, /* tp_base */
15174 0, /* tp_dict */
15175 0, /* tp_descr_get */
15176 0, /* tp_descr_set */
15177 0, /* tp_dictoffset */
15178 0, /* tp_init */
15179 0, /* tp_alloc */
15180 unicode_new, /* tp_new */
15181 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015182};
15183
15184/* Initialize the Unicode implementation */
15185
Victor Stinner3a50e702011-10-18 21:21:00 +020015186int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015187{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015188 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015189 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015190 0x000A, /* LINE FEED */
15191 0x000D, /* CARRIAGE RETURN */
15192 0x001C, /* FILE SEPARATOR */
15193 0x001D, /* GROUP SEPARATOR */
15194 0x001E, /* RECORD SEPARATOR */
15195 0x0085, /* NEXT LINE */
15196 0x2028, /* LINE SEPARATOR */
15197 0x2029, /* PARAGRAPH SEPARATOR */
15198 };
15199
Fred Drakee4315f52000-05-09 19:53:39 +000015200 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015201 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015202 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015203 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015204 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015205
Guido van Rossumcacfc072002-05-24 19:01:59 +000015206 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015207 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015208
15209 /* initialize the linebreak bloom filter */
15210 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015211 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015212 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015213
Christian Heimes26532f72013-07-20 14:57:16 +020015214 if (PyType_Ready(&EncodingMapType) < 0)
15215 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015216
Benjamin Petersonc4311282012-10-30 23:21:10 -040015217 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15218 Py_FatalError("Can't initialize field name iterator type");
15219
15220 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15221 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015222
Victor Stinner3a50e702011-10-18 21:21:00 +020015223 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015224}
15225
15226/* Finalize the Unicode implementation */
15227
/* Does no work and unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15233
Guido van Rossumd57fd912000-03-10 22:53:23 +000015234void
Thomas Wouters78890102000-07-22 19:25:51 +000015235_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015236{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015237 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015238
Serhiy Storchaka05997252013-01-26 12:14:02 +020015239 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015240
Serhiy Storchaka05997252013-01-26 12:14:02 +020015241 for (i = 0; i < 256; i++)
15242 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015243 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015244 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015245}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015246
Walter Dörwald16807132007-05-25 13:52:07 +000015247void
15248PyUnicode_InternInPlace(PyObject **p)
15249{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015250 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015252#ifdef Py_DEBUG
15253 assert(s != NULL);
15254 assert(_PyUnicode_CHECK(s));
15255#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015257 return;
15258#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 /* If it's a subclass, we don't really know what putting
15260 it in the interned dict might do. */
15261 if (!PyUnicode_CheckExact(s))
15262 return;
15263 if (PyUnicode_CHECK_INTERNED(s))
15264 return;
15265 if (interned == NULL) {
15266 interned = PyDict_New();
15267 if (interned == NULL) {
15268 PyErr_Clear(); /* Don't leave an exception */
15269 return;
15270 }
15271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015273 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015274 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015275 if (t == NULL) {
15276 PyErr_Clear();
15277 return;
15278 }
15279 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015280 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015281 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015282 return;
15283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 /* The two references in interned are not counted by refcnt.
15285 The deallocator will take care of this */
15286 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015287 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015288}
15289
15290void
15291PyUnicode_InternImmortal(PyObject **p)
15292{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 PyUnicode_InternInPlace(p);
15294 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015295 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015296 Py_INCREF(*p);
15297 }
Walter Dörwald16807132007-05-25 13:52:07 +000015298}
15299
15300PyObject *
15301PyUnicode_InternFromString(const char *cp)
15302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 PyObject *s = PyUnicode_FromString(cp);
15304 if (s == NULL)
15305 return NULL;
15306 PyUnicode_InternInPlace(&s);
15307 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015308}
15309
Alexander Belopolsky40018472011-02-26 01:02:56 +000015310void
15311_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015312{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015314 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 Py_ssize_t i, n;
15316 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015317
Benjamin Peterson14339b62009-01-31 16:36:08 +000015318 if (interned == NULL || !PyDict_Check(interned))
15319 return;
15320 keys = PyDict_Keys(interned);
15321 if (keys == NULL || !PyList_Check(keys)) {
15322 PyErr_Clear();
15323 return;
15324 }
Walter Dörwald16807132007-05-25 13:52:07 +000015325
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15327 detector, interned unicode strings are not forcibly deallocated;
15328 rather, we give them their stolen references back, and then clear
15329 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015330
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 n = PyList_GET_SIZE(keys);
15332 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015333 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015335 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015336 if (PyUnicode_READY(s) == -1) {
15337 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015338 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015340 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015341 case SSTATE_NOT_INTERNED:
15342 /* XXX Shouldn't happen */
15343 break;
15344 case SSTATE_INTERNED_IMMORTAL:
15345 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015346 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 break;
15348 case SSTATE_INTERNED_MORTAL:
15349 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015350 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 break;
15352 default:
15353 Py_FatalError("Inconsistent interned string state.");
15354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015355 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 }
15357 fprintf(stderr, "total size of all interned strings: "
15358 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15359 "mortal/immortal\n", mortal_size, immortal_size);
15360 Py_DECREF(keys);
15361 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015362 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015363}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015364
15365
15366/********************* Unicode Iterator **************************/
15367
15368typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 PyObject_HEAD
15370 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015371 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015372} unicodeiterobject;
15373
15374static void
15375unicodeiter_dealloc(unicodeiterobject *it)
15376{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 _PyObject_GC_UNTRACK(it);
15378 Py_XDECREF(it->it_seq);
15379 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015380}
15381
15382static int
15383unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15384{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015385 Py_VISIT(it->it_seq);
15386 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015387}
15388
15389static PyObject *
15390unicodeiter_next(unicodeiterobject *it)
15391{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015392 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015393
Benjamin Peterson14339b62009-01-31 16:36:08 +000015394 assert(it != NULL);
15395 seq = it->it_seq;
15396 if (seq == NULL)
15397 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015398 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015400 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15401 int kind = PyUnicode_KIND(seq);
15402 void *data = PyUnicode_DATA(seq);
15403 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15404 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015405 if (item != NULL)
15406 ++it->it_index;
15407 return item;
15408 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409
Benjamin Peterson14339b62009-01-31 16:36:08 +000015410 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015411 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015412 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015413}
15414
15415static PyObject *
15416unicodeiter_len(unicodeiterobject *it)
15417{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 Py_ssize_t len = 0;
15419 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015420 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015421 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015422}
15423
15424PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15425
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015426static PyObject *
15427unicodeiter_reduce(unicodeiterobject *it)
15428{
15429 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015430 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015431 it->it_seq, it->it_index);
15432 } else {
15433 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15434 if (u == NULL)
15435 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015436 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015437 }
15438}
15439
15440PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15441
15442static PyObject *
15443unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15444{
15445 Py_ssize_t index = PyLong_AsSsize_t(state);
15446 if (index == -1 && PyErr_Occurred())
15447 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015448 if (it->it_seq != NULL) {
15449 if (index < 0)
15450 index = 0;
15451 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15452 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15453 it->it_index = index;
15454 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015455 Py_RETURN_NONE;
15456}
15457
15458PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15459
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015460static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015461 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015462 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015463 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15464 reduce_doc},
15465 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15466 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015467 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015468};
15469
15470PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015471 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15472 "str_iterator", /* tp_name */
15473 sizeof(unicodeiterobject), /* tp_basicsize */
15474 0, /* tp_itemsize */
15475 /* methods */
15476 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15477 0, /* tp_print */
15478 0, /* tp_getattr */
15479 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015480 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015481 0, /* tp_repr */
15482 0, /* tp_as_number */
15483 0, /* tp_as_sequence */
15484 0, /* tp_as_mapping */
15485 0, /* tp_hash */
15486 0, /* tp_call */
15487 0, /* tp_str */
15488 PyObject_GenericGetAttr, /* tp_getattro */
15489 0, /* tp_setattro */
15490 0, /* tp_as_buffer */
15491 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15492 0, /* tp_doc */
15493 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15494 0, /* tp_clear */
15495 0, /* tp_richcompare */
15496 0, /* tp_weaklistoffset */
15497 PyObject_SelfIter, /* tp_iter */
15498 (iternextfunc)unicodeiter_next, /* tp_iternext */
15499 unicodeiter_methods, /* tp_methods */
15500 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015501};
15502
15503static PyObject *
15504unicode_iter(PyObject *seq)
15505{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015506 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015507
Benjamin Peterson14339b62009-01-31 16:36:08 +000015508 if (!PyUnicode_Check(seq)) {
15509 PyErr_BadInternalCall();
15510 return NULL;
15511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015512 if (PyUnicode_READY(seq) == -1)
15513 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015514 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15515 if (it == NULL)
15516 return NULL;
15517 it->it_index = 0;
15518 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015519 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015520 _PyObject_GC_TRACK(it);
15521 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015522}
15523
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015524
15525size_t
15526Py_UNICODE_strlen(const Py_UNICODE *u)
15527{
15528 int res = 0;
15529 while(*u++)
15530 res++;
15531 return res;
15532}
15533
15534Py_UNICODE*
15535Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15536{
15537 Py_UNICODE *u = s1;
15538 while ((*u++ = *s2++));
15539 return s1;
15540}
15541
15542Py_UNICODE*
15543Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15544{
15545 Py_UNICODE *u = s1;
15546 while ((*u++ = *s2++))
15547 if (n-- == 0)
15548 break;
15549 return s1;
15550}
15551
15552Py_UNICODE*
15553Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15554{
15555 Py_UNICODE *u1 = s1;
15556 u1 += Py_UNICODE_strlen(u1);
15557 Py_UNICODE_strcpy(u1, s2);
15558 return s1;
15559}
15560
15561int
15562Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15563{
15564 while (*s1 && *s2 && *s1 == *s2)
15565 s1++, s2++;
15566 if (*s1 && *s2)
15567 return (*s1 < *s2) ? -1 : +1;
15568 if (*s1)
15569 return 1;
15570 if (*s2)
15571 return -1;
15572 return 0;
15573}
15574
15575int
15576Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15577{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015578 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015579 for (; n != 0; n--) {
15580 u1 = *s1;
15581 u2 = *s2;
15582 if (u1 != u2)
15583 return (u1 < u2) ? -1 : +1;
15584 if (u1 == '\0')
15585 return 0;
15586 s1++;
15587 s2++;
15588 }
15589 return 0;
15590}
15591
15592Py_UNICODE*
15593Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15594{
15595 const Py_UNICODE *p;
15596 for (p = s; *p; p++)
15597 if (*p == c)
15598 return (Py_UNICODE*)p;
15599 return NULL;
15600}
15601
15602Py_UNICODE*
15603Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15604{
15605 const Py_UNICODE *p;
15606 p = s + Py_UNICODE_strlen(s);
15607 while (p != s) {
15608 p--;
15609 if (*p == c)
15610 return (Py_UNICODE*)p;
15611 }
15612 return NULL;
15613}
Victor Stinner331ea922010-08-10 16:37:20 +000015614
Victor Stinner71133ff2010-09-01 23:43:53 +000015615Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015616PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015617{
Victor Stinner577db2c2011-10-11 22:12:48 +020015618 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015619 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015621 if (!PyUnicode_Check(unicode)) {
15622 PyErr_BadArgument();
15623 return NULL;
15624 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015625 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015626 if (u == NULL)
15627 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015628 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015629 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015630 PyErr_NoMemory();
15631 return NULL;
15632 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015633 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015634 size *= sizeof(Py_UNICODE);
15635 copy = PyMem_Malloc(size);
15636 if (copy == NULL) {
15637 PyErr_NoMemory();
15638 return NULL;
15639 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015640 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015641 return copy;
15642}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015643
Georg Brandl66c221e2010-10-14 07:04:07 +000015644/* A _string module, to export formatter_parser and formatter_field_name_split
15645 to the string.Formatter class implemented in Python. */
15646
15647static PyMethodDef _string_methods[] = {
15648 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15649 METH_O, PyDoc_STR("split the argument as a field name")},
15650 {"formatter_parser", (PyCFunction) formatter_parser,
15651 METH_O, PyDoc_STR("parse the argument as a format string")},
15652 {NULL, NULL}
15653};
15654
15655static struct PyModuleDef _string_module = {
15656 PyModuleDef_HEAD_INIT,
15657 "_string",
15658 PyDoc_STR("string helper module"),
15659 0,
15660 _string_methods,
15661 NULL,
15662 NULL,
15663 NULL,
15664 NULL
15665};
15666
15667PyMODINIT_FUNC
15668PyInit__string(void)
15669{
15670 return PyModule_Create(&_string_module);
15671}
15672
15673
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015674#ifdef __cplusplus
15675}
15676#endif