blob: 4e0c663e338a5269c1d1b280af999a9a4b90e31c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: During the interpreter's initialization phase, some globals are
      currently initialized dynamically, as needed. In the process, Unicode
      objects may be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the assertions in this file: debug builds run
   _PyUnicode_CheckConsistency() without scanning the string content,
   other builds only test the type with PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.

   The macros starting with an underscore only cast `op` to the structure
   that owns the field and read it (two of them, _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH, additionally assert _PyUnicode_CHECK(op)).
   PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH also assert that the string is
   ready and special-case compact ASCII strings. */

#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* Compact ASCII: the memory located right after the PyASCIIObject header;
   anything else: the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Compact ASCII: the character length; anything else: the utf8_length
   field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): after asserting
   _PyUnicode_CHECK(op), evaluates to 0 when `op` is already ready and to
   the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  The expansion must only refer to the macro parameter `op`: it
   previously named a variable called `unicode`, so it only compiled (and
   only tested the right object) when the caller's variable happened to
   have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not compact ASCII, its utf8 pointer is set, and that
   pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready yet
   or wstr is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro converting a run of characters from one integer
   type to another.

   from_type, to_type: valid type names of the source / destination
                       characters.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is exclusive).
   to:                 pointer of type "to_type *" to the buffer receiving
                       the converted characters; it must have room for
                       (end - begin) items.

   The bulk of the input is copied four items per iteration; the remaining
   0-3 items are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_src_end = (const from_type *)(end);           \
        Py_ssize_t _count = _src_end - _src;                            \
        const from_type *_unrolled_end =                                \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _unrolled_end; _src += 4, _dst += 4) {            \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _src_end; _src++, _dst++)                         \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Overallocation factor of the writers in this file.
   NOTE(review): the 50% / 25% figures suggest the value is used as a
   divisor of the requested size -- confirm at the use sites. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700207static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code (0..127): the entry is 1 for a
   whitespace character and 0 for everything else (entries that are not
   listed are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line break characters.

   Lookup table indexed by an ASCII code (0..127): the entry is 1 for a
   line break character and 0 for everything else (entries that are not
   listed are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers known to this file.  _Py_ERROR_OTHER stands for any
   handler name that is not listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL and for "strict", the matching
   constant for the other known names (the comparison is exact and case
   sensitive), and _Py_ERROR_OTHER for any other name.  Never returns
   _Py_ERROR_UNKNOWN. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* No handler given: the default is "strict". */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal layout of a unicode object.

   Verifies through assert() that the state bits of `op` (kind, compact,
   ascii, ready, interned) and its data / utf8 / wstr pointers and lengths
   agree with each other.  When `check_content` is non-zero and the string
   is not of the wchar_t kind, the characters are scanned as well to check
   that the narrowest suitable kind is used and that the buffer ends with a
   null character.

   Always returns 1, so that a call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII string: the characters are stored right
               after the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* string which is not ready yet: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* ready string with a separate data block */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                /* ASCII data doubles as the UTF-8 representation */
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the data block itself exactly when the kind has the
               width of wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized_kind = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized_kind = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing representation has a length of zero */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch) {
                highest = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii) {
                assert(highest < 128);
            }
            else {
                assert(highest >= 128);
                assert(highest <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        /* the character buffer is null-terminated */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700723static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700829static inline Py_ssize_t
830findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200849 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200854 default:
855 assert(0);
856 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte being set to 0xff; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Benjamin Petersonbac79492012-01-14 13:34:47 -05001032 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001033 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
1168char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001169 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170}
1171
1172void *_PyUnicode_compact_data(void *unicode) {
1173 return _PyUnicode_COMPACT_DATA(unicode);
1174}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate followed by a low surrogate becomes one code point; any
   other unit, including an unpaired surrogate, is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 Py_ssize_t i;
1515
Victor Stinnera0702ab2011-09-29 14:14:38 +02001516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001518 if (ch > to_maxchar)
1519 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 }
1523 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return 0;
1525}
1526
Victor Stinnerd3f08822012-05-29 12:57:52 +02001527void
1528_PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001531{
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539{
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001549 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 return -1;
1551
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
Victor Stinner488fa492011-12-12 00:01:39 +01001576 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589}
1590
Victor Stinner17222162011-09-28 22:15:37 +02001591/* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599{
1600 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001601 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602
Victor Stinnerc53be962011-10-02 21:33:54 +02001603 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 }
1617 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 return 0;
1634}
1635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001636int
1637_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 wchar_t *end;
1640 Py_UCS4 maxchar = 0;
1641 Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643 Py_ssize_t length_wo_surrogates;
1644#endif
1645
Georg Brandl7597add2011-10-05 16:36:47 +02001646 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001647 strings were created using _PyObject_New() and where no canonical
1648 representation (the str field) has been set yet aka strings
1649 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001650 assert(_PyUnicode_CHECK(unicode));
1651 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001653 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001654 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 /* Actually, it should neither be interned nor be anything else: */
1656 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001659 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
1663 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001664 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 PyErr_NoMemory();
1667 return -1;
1668 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001669 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 _PyUnicode_WSTR(unicode), end,
1671 PyUnicode_1BYTE_DATA(unicode));
1672 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001678 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001682 _PyUnicode_UTF8(unicode) = NULL;
1683 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 PyObject_FREE(_PyUnicode_WSTR(unicode));
1686 _PyUnicode_WSTR(unicode) = NULL;
1687 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688 }
1689 /* In this case we might have to convert down from 4-byte native
1690 wchar_t to 2-byte unicode. */
1691 else if (maxchar < 65536) {
1692 assert(num_surrogates == 0 &&
1693 "FindMaxCharAndNumSurrogatePairs() messed up");
1694
Victor Stinner506f5922011-09-28 22:34:18 +02001695#if SIZEOF_WCHAR_T == 2
1696 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001697 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001698 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001701 _PyUnicode_UTF8(unicode) = NULL;
1702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001703#else
1704 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001706 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001707 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001708 PyErr_NoMemory();
1709 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 }
Victor Stinner506f5922011-09-28 22:34:18 +02001711 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712 _PyUnicode_WSTR(unicode), end,
1713 PyUnicode_2BYTE_DATA(unicode));
1714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719 PyObject_FREE(_PyUnicode_WSTR(unicode));
1720 _PyUnicode_WSTR(unicode) = NULL;
1721 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725 else {
1726#if SIZEOF_WCHAR_T == 2
1727 /* in case the native representation is 2-bytes, we need to allocate a
1728 new normalized 4-byte version. */
1729 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001730 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731 PyErr_NoMemory();
1732 return -1;
1733 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001734 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 PyErr_NoMemory();
1737 return -1;
1738 }
1739 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001743 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001745 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 PyObject_FREE(_PyUnicode_WSTR(unicode));
1747 _PyUnicode_WSTR(unicode) = NULL;
1748 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750 assert(num_surrogates == 0);
1751
Victor Stinnerc3c74152011-10-02 20:39:55 +02001752 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001754 _PyUnicode_UTF8(unicode) = NULL;
1755 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759 }
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 return 0;
1763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001766unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Walter Dörwald16807132007-05-25 13:52:07 +00001768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 case SSTATE_NOT_INTERNED:
1770 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001771
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001775 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached one-character strings in unicode_latin1[], and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a canonical string of length one can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001995 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002011 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return NULL;
2019
Victor Stinner8faf8212011-12-08 22:14:11 +01002020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 if (!unicode)
2022 return NULL;
2023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#else
2043 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045#endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002051 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052}
2053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 return NULL;
2061 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070{
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002077}
2078
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002093 return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002101 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002106 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107}
2108
Benjamin Peterson0df54292012-03-26 14:50:32 -04002109/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002110
Victor Stinnerd3f08822012-05-29 12:57:52 +02002111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002113{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002114 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002115 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002117#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002118 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002119#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002120 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002121 }
Victor Stinner785938e2011-12-11 20:09:03 +01002122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002124 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002128}
2129
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002133 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002142 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002143 }
2144}
2145
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002146static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002147align_maxchar(Py_UCS4 maxchar)
2148{
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157}
2158
Victor Stinner702c7342011-10-05 13:50:52 +02002159static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002164
Serhiy Storchaka678db842013-01-26 12:16:36 +02002165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002168 if (size == 1)
2169 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002171 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002172 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002176 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002178}
2179
Victor Stinnere57b1c02011-09-28 22:20:48 +02002180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182{
2183 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002184 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002185
Serhiy Storchaka678db842013-01-26 12:16:36 +02002186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002188 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002189 if (size == 1)
2190 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002192 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002193 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 if (!res)
2195 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
Victor Stinnere57b1c02011-09-28 22:20:48 +02002206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208{
2209 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002211
Serhiy Storchaka678db842013-01-26 12:16:36 +02002212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002215 if (size == 1)
2216 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002218 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (!res)
2221 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002241 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002243 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002248 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252}
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
Victor Stinner94d558b2012-04-27 22:26:58 +02002271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
Victor Stinnerece58de2012-04-23 23:36:38 +02002274 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002275 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002285 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002286 assert(0);
2287 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 }
2289}
2290
Victor Stinner25a4b292011-10-06 12:31:55 +02002291/* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002294static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002328 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334}
2335
Victor Stinner034f6cf2011-09-30 02:26:44 +02002336PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002337_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338{
Victor Stinner87af4f22011-11-21 23:03:47 +01002339 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002340 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002346 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348
Victor Stinner87af4f22011-11-21 23:03:47 +01002349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
Christian Heimesf051e432016-09-13 20:22:02 +02002355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002356 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002357 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002358 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002359}
2360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
Benjamin Petersonbac79492012-01-14 13:34:47 -05002372 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002382 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002383 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002413 default:
2414 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 }
Victor Stinner01698042011-10-04 00:04:26 +02002416 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423{
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002461 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002472 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482 return as_ucs4(string, NULL, 0, 1);
2483}
2484
2485#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002486
Alexander Belopolsky40018472011-02-26 01:02:56 +00002487PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002488PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002492 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 PyErr_BadInternalCall();
2494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496
Martin v. Löwis790465f2008-04-05 20:41:37 +00002497 if (size == -1) {
2498 size = wcslen(w);
2499 }
2500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502}
2503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002505
Victor Stinner15a11362012-10-06 23:48:20 +02002506/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002507 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2508 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2509#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002510
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514{
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554{
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570}
2571
Victor Stinner96865452011-03-01 23:44:09 +00002572static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002573unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002575{
Victor Stinnere215d962012-10-06 23:03:36 +02002576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t width;
2580 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002584 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585
2586 p = f;
2587 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
Victor Stinner96865452011-03-01 23:44:09 +00002593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002598 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002599 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002601 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002603 return NULL;
2604 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002606 f++;
2607 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002631 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002632 f--;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002638 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002641 longflag = 1;
2642 ++f;
2643 }
Victor Stinner96865452011-03-01 23:44:09 +00002644 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002646 longlongflag = 1;
2647 f += 2;
2648 }
Victor Stinner96865452011-03-01 23:44:09 +00002649 }
2650 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002652 size_tflag = 1;
2653 ++f;
2654 }
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002664 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002669 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002679 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002680 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002681
2682 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002685 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002687 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002688 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002689 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, size_t));
2692 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 }
2699 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002704 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002705 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002706 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, Py_ssize_t));
2709 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
Victor Stinnere215d962012-10-06 23:03:36 +02002715 if (precision < len)
2716 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
2718 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 }
Victor Stinner15a11362012-10-06 23:48:20 +02002730 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002731 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002736
Victor Stinner4a587072013-11-19 12:54:53 +01002737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
Victor Stinner4a587072013-11-19 12:54:53 +01002760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 }
2793 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002796 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002867 return f;
2868}
2869
Walter Dörwaldd2034312007-05-18 16:29:38 +00002870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
Victor Stinnere215d962012-10-06 23:03:36 +02002873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876
Victor Stinner8f674cc2013-04-17 23:02:17 +02002877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002880
Benjamin Peterson0c212142016-09-20 20:39:33 -07002881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002885 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 const char *p;
2892 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893
Victor Stinnere215d962012-10-06 23:03:36 +02002894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
2898 PyErr_Format(PyExc_ValueError,
2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900 "string, got a non-ASCII byte: 0x%02x",
2901 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002902 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 }
2904 p++;
2905 }
2906 while (*p != '\0' && *p != '%');
2907 len = p - f;
2908
2909 if (*p == '\0')
2910 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002911
2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002918 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002919 return _PyUnicodeWriter_Finish(&writer);
2920
2921 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002922 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002971 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024{
Victor Stinner8faf8212011-12-08 22:14:11 +01003025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003030
Victor Stinner985a82a2014-01-03 12:53:47 +01003031 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003035PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003040 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003041 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_INCREF(obj);
3043 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
3045 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003048 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003052 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003057PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_BadInternalCall();
3066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 return v;
3077 }
3078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003088 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 Py_TYPE(obj)->tp_name);
3090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003091 }
Tim Petersced69f82003-09-16 20:30:58 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003097
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101}
3102
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Characters accepted by Py_ISALNUM() and '.' are copied to `lower` after
   Py_TOLOWER().  Every run of other characters located between two copied
   characters is replaced by a single '_'; such runs at the start or at the
   end of the name are dropped.  The output is always null-terminated on
   success.

   encoding:  null-terminated input name (must not be NULL).
   lower:     output buffer.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard lower_len - 1 wraps around (size_t is unsigned),
       l_end points far outside the buffer and the bounds checks below can
       never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the terminator */
    punct = 0;                       /* pending separator run? */
    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Separator: remember it, emit nothing yet. */
            punct = 1;
            continue;
        }

        /* Flush a pending separator run as one '_', except at the very
           beginning of the output. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 Py_ssize_t size,
3155 const char *encoding,
3156 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003157{
3158 PyObject *buffer = NULL, *unicode;
3159 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162 if (encoding == NULL) {
3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003165
Fred Drakee4315f52000-05-09 19:53:39 +00003166 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168 char *lower = buflower;
3169
3170 /* Fast paths */
3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172 lower += 3;
3173 if (*lower == '_') {
3174 /* Match "utf8" and "utf_8" */
3175 lower++;
3176 }
3177
3178 if (lower[0] == '8' && lower[1] == 0) {
3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180 }
3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183 }
3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186 }
3187 }
3188 else {
3189 if (strcmp(lower, "ascii") == 0
3190 || strcmp(lower, "us_ascii") == 0) {
3191 return PyUnicode_DecodeASCII(s, size, errors);
3192 }
Steve Dowercc16be82016-09-08 10:35:16 -07003193 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003194 else if (strcmp(lower, "mbcs") == 0) {
3195 return PyUnicode_DecodeMBCS(s, size, errors);
3196 }
3197 #endif
3198 else if (strcmp(lower, "latin1") == 0
3199 || strcmp(lower, "latin_1") == 0
3200 || strcmp(lower, "iso_8859_1") == 0
3201 || strcmp(lower, "iso8859_1") == 0) {
3202 return PyUnicode_DecodeLatin1(s, size, errors);
3203 }
3204 }
Victor Stinner37296e82010-06-10 13:36:23 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003208 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003210 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003211 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (buffer == NULL)
3213 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003214 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 if (unicode == NULL)
3216 goto onError;
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003219 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220 "use codecs.decode() to decode to arbitrary types",
3221 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003222 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_DECREF(unicode);
3224 goto onError;
3225 }
3226 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003227 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003228
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 Py_XDECREF(buffer);
3231 return NULL;
3232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
3235PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003236 const char *encoding,
3237 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003241 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242 }
3243
Serhiy Storchaka00939072016-10-27 21:05:49 +03003244 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245 "PyUnicode_AsDecodedObject() is deprecated; "
3246 "use PyCodec_Decode() to decode from str", 1) < 0)
3247 return NULL;
3248
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251
3252 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003253 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
3257PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003260{
3261 PyObject *v;
3262
3263 if (!PyUnicode_Check(unicode)) {
3264 PyErr_BadArgument();
3265 goto onError;
3266 }
3267
Serhiy Storchaka00939072016-10-27 21:05:49 +03003268 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3269 "PyUnicode_AsDecodedUnicode() is deprecated; "
3270 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3271 return NULL;
3272
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003273 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003275
3276 /* Decode via the codec registry */
3277 v = PyCodec_Decode(unicode, encoding, errors);
3278 if (v == NULL)
3279 goto onError;
3280 if (!PyUnicode_Check(v)) {
3281 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003282 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3283 "use codecs.decode() to decode to arbitrary types",
3284 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003285 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286 Py_DECREF(v);
3287 goto onError;
3288 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003289 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292 return NULL;
3293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 Py_ssize_t size,
3298 const char *encoding,
3299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300{
3301 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 unicode = PyUnicode_FromUnicode(s, size);
3304 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3307 Py_DECREF(unicode);
3308 return v;
3309}
3310
Alexander Belopolsky40018472011-02-26 01:02:56 +00003311PyObject *
3312PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003313 const char *encoding,
3314 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315{
3316 PyObject *v;
3317
3318 if (!PyUnicode_Check(unicode)) {
3319 PyErr_BadArgument();
3320 goto onError;
3321 }
3322
Serhiy Storchaka00939072016-10-27 21:05:49 +03003323 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3324 "PyUnicode_AsEncodedObject() is deprecated; "
3325 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3326 "or PyCodec_Encode() for generic encoding", 1) < 0)
3327 return NULL;
3328
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003329 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003331
3332 /* Encode via the codec registry */
3333 v = PyCodec_Encode(unicode, encoding, errors);
3334 if (v == NULL)
3335 goto onError;
3336 return v;
3337
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339 return NULL;
3340}
3341
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first failing character, or 0 when every character
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Null-terminated scratch string holding the character under test. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return current - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3388
Victor Stinner1b579672011-12-17 05:47:23 +01003389static int
3390locale_error_handler(const char *errors, int *surrogateescape)
3391{
Victor Stinner50149202015-09-22 00:26:54 +02003392 _Py_error_handler error_handler = get_error_handler(errors);
3393 switch (error_handler)
3394 {
3395 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003396 *surrogateescape = 0;
3397 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003398 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003399 *surrogateescape = 1;
3400 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003401 default:
3402 PyErr_Format(PyExc_ValueError,
3403 "only 'strict' and 'surrogateescape' error handlers "
3404 "are supported, not '%s'",
3405 errors);
3406 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003407 }
Victor Stinner1b579672011-12-17 05:47:23 +01003408}
3409
/* Encode a Unicode object to bytes using the C library's wide-character to
   multibyte conversion.

   unicode: object to encode.
   errors:  error handler name; only the handlers accepted by
            locale_error_handler() ("strict" and "surrogateescape") are
            supported.

   Returns a bytes object, or NULL with an exception set: ValueError for an
   unsupported handler or an embedded null character, MemoryError when
   Py_EncodeLocale() reports error position (size_t)-1, and otherwise the
   exception raised by PyCodec_StrictErrors() on a UnicodeEncodeError built
   for the failing position. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* The conversions below stop at the first null character, so a string
       whose C length differs from its reported length cannot be encoded. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* (size_t)-1 is treated as out-of-memory; any other value is
               the index of the character that failed to encode. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First call only measures the required output length. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown here; encode_error will look it up. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call converts into the bytes object; len+1 leaves room for
           the terminating null byte. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any of the calls below get a chance to
       modify it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* wstr is still needed to locate the failing character, so it is only
       released after this lookup. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Build the exception reason from the system error message; fall back
       to a fixed message if the message itself cannot be decoded. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError(encoding, object, start, end, reason) covering the
       single character at error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* Hand the exception instance to the strict error handler, then
           drop our reference to it. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3519
Victor Stinnerad158722010-10-27 00:25:46 +00003520PyObject *
3521PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003522{
Steve Dowercc16be82016-09-08 10:35:16 -07003523#if defined(__APPLE__)
3524 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003525#else
Victor Stinner793b5312011-04-27 00:24:21 +02003526 PyInterpreterState *interp = PyThreadState_GET()->interp;
3527 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3528 cannot use it to encode and decode filenames before it is loaded. Load
3529 the Python codec requires to encode at least its own filename. Use the C
3530 version of the locale codec until the codec registry is initialized and
3531 the Python codec is loaded.
3532
3533 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3534 cannot only rely on it: check also interp->fscodec_initialized for
3535 subinterpreters. */
3536 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003537 return PyUnicode_AsEncodedString(unicode,
3538 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003539 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003540 }
3541 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003542 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003543 }
Victor Stinnerad158722010-10-27 00:25:46 +00003544#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545}
3546
Alexander Belopolsky40018472011-02-26 01:02:56 +00003547PyObject *
3548PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003549 const char *encoding,
3550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
3552 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003553 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003554
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 }
Fred Drakee4315f52000-05-09 19:53:39 +00003559
Victor Stinner942889a2016-09-05 15:40:10 -07003560 if (encoding == NULL) {
3561 return _PyUnicode_AsUTF8String(unicode, errors);
3562 }
3563
Fred Drakee4315f52000-05-09 19:53:39 +00003564 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566 char *lower = buflower;
3567
3568 /* Fast paths */
3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570 lower += 3;
3571 if (*lower == '_') {
3572 /* Match "utf8" and "utf_8" */
3573 lower++;
3574 }
3575
3576 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003577 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003578 }
3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3581 }
3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3584 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003585 }
Victor Stinner942889a2016-09-05 15:40:10 -07003586 else {
3587 if (strcmp(lower, "ascii") == 0
3588 || strcmp(lower, "us_ascii") == 0) {
3589 return _PyUnicode_AsASCIIString(unicode, errors);
3590 }
Steve Dowercc16be82016-09-08 10:35:16 -07003591#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003592 else if (strcmp(lower, "mbcs") == 0) {
3593 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3594 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003595#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003596 else if (strcmp(lower, "latin1") == 0 ||
3597 strcmp(lower, "latin_1") == 0 ||
3598 strcmp(lower, "iso_8859_1") == 0 ||
3599 strcmp(lower, "iso8859_1") == 0) {
3600 return _PyUnicode_AsLatin1String(unicode, errors);
3601 }
3602 }
Victor Stinner37296e82010-06-10 13:36:23 +00003603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604
3605 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003606 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003608 return NULL;
3609
3610 /* The normal path */
3611 if (PyBytes_Check(v))
3612 return v;
3613
3614 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003616 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003617 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003618
3619 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003620 "encoder %s returned bytearray instead of bytes; "
3621 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003622 encoding);
3623 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003624 Py_DECREF(v);
3625 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003626 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003628 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3629 Py_DECREF(v);
3630 return b;
3631 }
3632
3633 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003634 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3635 "use codecs.encode() to encode to arbitrary types",
3636 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003637 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003638 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003639 return NULL;
3640}
3641
Alexander Belopolsky40018472011-02-26 01:02:56 +00003642PyObject *
3643PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003644 const char *encoding,
3645 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003646{
3647 PyObject *v;
3648
3649 if (!PyUnicode_Check(unicode)) {
3650 PyErr_BadArgument();
3651 goto onError;
3652 }
3653
Serhiy Storchaka00939072016-10-27 21:05:49 +03003654 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3655 "PyUnicode_AsEncodedUnicode() is deprecated; "
3656 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3657 return NULL;
3658
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661
3662 /* Encode via the codec registry */
3663 v = PyCodec_Encode(unicode, encoding, errors);
3664 if (v == NULL)
3665 goto onError;
3666 if (!PyUnicode_Check(v)) {
3667 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3669 "use codecs.encode() to encode to arbitrary types",
3670 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003671 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 Py_DECREF(v);
3673 goto onError;
3674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003676
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 return NULL;
3679}
3680
/* Locate the first byte of `str` (at most `len` bytes are examined) that
   mbrtowc() cannot convert.

   Returns the byte offset of the offending sequence.  Returns 0 when no such
   sequence is found, when the end of the string is reached first, or when
   the build does not define HAVE_MBRTOWC. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3711
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003713PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003714 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715{
3716 wchar_t smallbuf[256];
3717 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3718 wchar_t *wstr;
3719 size_t wlen, wlen2;
3720 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003721 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003722 size_t error_pos;
3723 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003724 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3725 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003726
3727 if (locale_error_handler(errors, &surrogateescape) < 0)
3728 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003729
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003730 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3731 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732 return NULL;
3733 }
3734
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003735 if (surrogateescape) {
3736 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003737 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003738 if (wstr == NULL) {
3739 if (wlen == (size_t)-1)
3740 PyErr_NoMemory();
3741 else
3742 PyErr_SetFromErrno(PyExc_OSError);
3743 return NULL;
3744 }
3745
3746 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003747 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003748 }
3749 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003750 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751#ifndef HAVE_BROKEN_MBSTOWCS
3752 wlen = mbstowcs(NULL, str, 0);
3753#else
3754 wlen = len;
3755#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003756 if (wlen == (size_t)-1)
3757 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003758 if (wlen+1 <= smallbuf_len) {
3759 wstr = smallbuf;
3760 }
3761 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003762 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003763 if (!wstr)
3764 return PyErr_NoMemory();
3765 }
3766
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003767 wlen2 = mbstowcs(wstr, str, wlen+1);
3768 if (wlen2 == (size_t)-1) {
3769 if (wstr != smallbuf)
3770 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003771 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003772 }
3773#ifdef HAVE_BROKEN_MBSTOWCS
3774 assert(wlen2 == wlen);
3775#endif
3776 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3777 if (wstr != smallbuf)
3778 PyMem_Free(wstr);
3779 }
3780 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003781
3782decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003783 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003784 errmsg = strerror(errno);
3785 assert(errmsg != NULL);
3786
3787 error_pos = mbstowcs_errorpos(str, len);
3788 if (errmsg != NULL) {
3789 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003790 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003793 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003794 }
Victor Stinner2f197072011-12-17 07:08:30 +01003795 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003796 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
3810 Py_XDECREF(exc);
3811 }
3812 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003813}
3814
3815PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003816PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817{
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003820}
3821
3822
3823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003828
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
Steve Dowercc16be82016-09-08 10:35:16 -07003832#if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003834#else
Victor Stinner793b5312011-04-27 00:24:21 +02003835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003846 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003848 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 }
3850 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 }
Victor Stinnerad158722010-10-27 00:25:46 +00003853#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854}
3855
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856
3857int
3858PyUnicode_FSConverter(PyObject* arg, void* addr)
3859{
Brett Cannonec6ce872016-09-06 15:50:29 -07003860 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003861 PyObject *output = NULL;
3862 Py_ssize_t size;
3863 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003864 if (arg == NULL) {
3865 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003866 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003867 return 1;
3868 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 path = PyOS_FSPath(arg);
3870 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003871 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 if (PyBytes_Check(path)) {
3874 output = path;
3875 }
3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3877 output = PyUnicode_EncodeFSDefault(path);
3878 Py_DECREF(path);
3879 if (!output) {
3880 return 0;
3881 }
3882 assert(PyBytes_Check(output));
3883 }
3884
Victor Stinner0ea2a462010-04-30 00:22:08 +00003885 size = PyBytes_GET_SIZE(output);
3886 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003887 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003888 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003889 Py_DECREF(output);
3890 return 0;
3891 }
3892 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003894}
3895
3896
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003897int
3898PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899{
Brett Cannona5711202016-09-06 19:36:01 -07003900 int is_buffer = 0;
3901 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka7a113a02017-04-20 22:55:06 +03003905 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 return 1;
3907 }
Brett Cannona5711202016-09-06 19:36:01 -07003908
3909 is_buffer = PyObject_CheckBuffer(arg);
3910 if (!is_buffer) {
3911 path = PyOS_FSPath(arg);
3912 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003913 return 0;
3914 }
Brett Cannona5711202016-09-06 19:36:01 -07003915 }
3916 else {
3917 path = arg;
3918 Py_INCREF(arg);
3919 }
3920
3921 if (PyUnicode_Check(path)) {
3922 if (PyUnicode_READY(path) == -1) {
3923 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003925 }
3926 output = path;
3927 }
3928 else if (PyBytes_Check(path) || is_buffer) {
3929 PyObject *path_bytes = NULL;
3930
3931 if (!PyBytes_Check(path) &&
3932 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3933 "path should be string, bytes, or os.PathLike, not %.200s",
3934 Py_TYPE(arg)->tp_name)) {
3935 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003937 }
3938 path_bytes = PyBytes_FromObject(path);
3939 Py_DECREF(path);
3940 if (!path_bytes) {
3941 return 0;
3942 }
3943 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3944 PyBytes_GET_SIZE(path_bytes));
3945 Py_DECREF(path_bytes);
3946 if (!output) {
3947 return 0;
3948 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003949 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003950 else {
3951 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003952 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003954 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003955 return 0;
3956 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003957 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003958 Py_DECREF(output);
3959 return 0;
3960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003962 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003963 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003964 Py_DECREF(output);
3965 return 0;
3966 }
3967 *(PyObject**)addr = output;
3968 return Py_CLEANUP_SUPPORTED;
3969}
3970
3971
Martin v. Löwis5b222132007-06-10 09:51:05 +00003972char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003974{
Christian Heimesf3863112007-11-22 07:46:41 +00003975 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003977 if (!PyUnicode_Check(unicode)) {
3978 PyErr_BadArgument();
3979 return NULL;
3980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003981 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003984 if (PyUnicode_UTF8(unicode) == NULL) {
3985 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003986 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 if (bytes == NULL)
3988 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3990 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003991 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 Py_DECREF(bytes);
3993 return NULL;
3994 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003996 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 PyBytes_AS_STRING(bytes),
3998 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 Py_DECREF(bytes);
4000 }
4001
4002 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004003 *psize = PyUnicode_UTF8_LENGTH(unicode);
4004 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004005}
4006
4007char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4011}
4012
/* Return the wchar_t (Py_UNICODE) representation stored on `unicode`,
   building it on first use.

   unicode: must be a str object, otherwise PyErr_BadArgument() is raised.
   size:    optional; when non-NULL it receives PyUnicode_WSTR_LENGTH().

   When no wstr buffer is attached yet, one is allocated with
   PyObject_MALLOC and filled from the canonical data:
     - 1-byte data is widened element by element;
     - 2-byte data is widened only when wchar_t is 4 bytes;
     - 4-byte data is converted to surrogate pairs only when wchar_t is
       2 bytes.
   The remaining kind/width combinations are treated as impossible (fatal
   error), since the messages state that wstr and the data should already
   share memory.

   Returns the stored buffer, or NULL with an exception set. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count code points above 0xFFFF; each needs one
               extra wchar_t for its surrogate pair. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* Length + extra surrogate slots + 1 for the terminator. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points in two. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-only guard against writing past the buffer. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Reject lengths whose byte size (plus terminator) would exceed
               PY_SSIZE_T_MAX before multiplying. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
4129
Alexander Belopolsky40018472011-02-26 01:02:56 +00004130Py_UNICODE *
4131PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134}
4135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136
Alexander Belopolsky40018472011-02-26 01:02:56 +00004137Py_ssize_t
4138PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139{
4140 if (!PyUnicode_Check(unicode)) {
4141 PyErr_BadArgument();
4142 goto onError;
4143 }
4144 return PyUnicode_GET_SIZE(unicode);
4145
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 return -1;
4148}
4149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150Py_ssize_t
4151PyUnicode_GetLength(PyObject *unicode)
4152{
Victor Stinner07621332012-06-16 04:53:46 +02004153 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 PyErr_BadArgument();
4155 return -1;
4156 }
Victor Stinner07621332012-06-16 04:53:46 +02004157 if (PyUnicode_READY(unicode) == -1)
4158 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 return PyUnicode_GET_LENGTH(unicode);
4160}
4161
4162Py_UCS4
4163PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4164{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004165 void *data;
4166 int kind;
4167
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004168 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4169 PyErr_BadArgument();
4170 return (Py_UCS4)-1;
4171 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004172 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004173 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174 return (Py_UCS4)-1;
4175 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004176 data = PyUnicode_DATA(unicode);
4177 kind = PyUnicode_KIND(unicode);
4178 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179}
4180
4181int
4182PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4183{
4184 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004185 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186 return -1;
4187 }
Victor Stinner488fa492011-12-12 00:01:39 +01004188 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004189 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004190 PyErr_SetString(PyExc_IndexError, "string index out of range");
4191 return -1;
4192 }
Victor Stinner488fa492011-12-12 00:01:39 +01004193 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004194 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004195 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4196 PyErr_SetString(PyExc_ValueError, "character out of range");
4197 return -1;
4198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004199 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4200 index, ch);
4201 return 0;
4202}
4203
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4209
Victor Stinner554f3f02010-06-16 23:33:54 +00004210/* create or adjust a UnicodeDecodeError */
4211static void
4212make_decode_exception(PyObject **exceptionObject,
4213 const char *encoding,
4214 const char *input, Py_ssize_t length,
4215 Py_ssize_t startpos, Py_ssize_t endpos,
4216 const char *reason)
4217{
4218 if (*exceptionObject == NULL) {
4219 *exceptionObject = PyUnicodeDecodeError_Create(
4220 encoding, input, length, startpos, endpos, reason);
4221 }
4222 else {
4223 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4224 goto onError;
4225 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4226 goto onError;
4227 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4228 goto onError;
4229 }
4230 return;
4231
4232onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004233 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004234}
4235
Steve Dowercc16be82016-09-08 10:35:16 -07004236#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237/* error handling callback helper:
4238 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004239 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 and adjust various state variables.
4241 return 0 on success, -1 on error
4242*/
4243
Alexander Belopolsky40018472011-02-26 01:02:56 +00004244static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245unicode_decode_call_errorhandler_wchar(
4246 const char *errors, PyObject **errorHandler,
4247 const char *encoding, const char *reason,
4248 const char **input, const char **inend, Py_ssize_t *startinpos,
4249 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4250 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004252 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253
4254 PyObject *restuple = NULL;
4255 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004256 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004257 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004258 Py_ssize_t requiredsize;
4259 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004260 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004261 wchar_t *repwstr;
4262 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004264 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4265 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 *errorHandler = PyCodec_LookupError(errors);
4269 if (*errorHandler == NULL)
4270 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 }
4272
Victor Stinner554f3f02010-06-16 23:33:54 +00004273 make_decode_exception(exceptionObject,
4274 encoding,
4275 *input, *inend - *input,
4276 *startinpos, *endinpos,
4277 reason);
4278 if (*exceptionObject == NULL)
4279 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280
4281 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4282 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004285 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 }
4288 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290
4291 /* Copy back the bytes variables, which might have been modified by the
4292 callback */
4293 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4294 if (!inputobj)
4295 goto onError;
4296 if (!PyBytes_Check(inputobj)) {
4297 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4298 }
4299 *input = PyBytes_AS_STRING(inputobj);
4300 insize = PyBytes_GET_SIZE(inputobj);
4301 *inend = *input + insize;
4302 /* we can DECREF safely, as the exception has another reference,
4303 so the object won't go away. */
4304 Py_DECREF(inputobj);
4305
4306 if (newpos<0)
4307 newpos = insize+newpos;
4308 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 goto onError;
4311 }
4312
4313 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4314 if (repwstr == NULL)
4315 goto onError;
4316 /* need more space? (at least enough for what we
4317 have+the replacement+the rest of the string (starting
4318 at the new input position), so we won't have to check space
4319 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004320 requiredsize = *outpos;
4321 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4322 goto overflow;
4323 requiredsize += repwlen;
4324 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4325 goto overflow;
4326 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004328 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004329 requiredsize = 2*outsize;
4330 if (unicode_resize(output, requiredsize) < 0)
4331 goto onError;
4332 }
4333 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4334 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004335 *endinpos = newpos;
4336 *inptr = *input + newpos;
4337
4338 /* we made it! */
4339 Py_XDECREF(restuple);
4340 return 0;
4341
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004342 overflow:
4343 PyErr_SetString(PyExc_OverflowError,
4344 "decoded result is too long for a Python string");
4345
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 onError:
4347 Py_XDECREF(restuple);
4348 return -1;
4349}
Steve Dowercc16be82016-09-08 10:35:16 -07004350#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351
4352static int
4353unicode_decode_call_errorhandler_writer(
4354 const char *errors, PyObject **errorHandler,
4355 const char *encoding, const char *reason,
4356 const char **input, const char **inend, Py_ssize_t *startinpos,
4357 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4358 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4359{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004360 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004361
4362 PyObject *restuple = NULL;
4363 PyObject *repunicode = NULL;
4364 Py_ssize_t insize;
4365 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004366 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367 PyObject *inputobj = NULL;
4368
4369 if (*errorHandler == NULL) {
4370 *errorHandler = PyCodec_LookupError(errors);
4371 if (*errorHandler == NULL)
4372 goto onError;
4373 }
4374
4375 make_decode_exception(exceptionObject,
4376 encoding,
4377 *input, *inend - *input,
4378 *startinpos, *endinpos,
4379 reason);
4380 if (*exceptionObject == NULL)
4381 goto onError;
4382
4383 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4384 if (restuple == NULL)
4385 goto onError;
4386 if (!PyTuple_Check(restuple)) {
4387 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4388 goto onError;
4389 }
4390 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004391 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004392
4393 /* Copy back the bytes variables, which might have been modified by the
4394 callback */
4395 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4396 if (!inputobj)
4397 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004398 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004400 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004401 *input = PyBytes_AS_STRING(inputobj);
4402 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004403 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004404 /* we can DECREF safely, as the exception has another reference,
4405 so the object won't go away. */
4406 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004410 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004411 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414
Victor Stinner8f674cc2013-04-17 23:02:17 +02004415 if (PyUnicode_READY(repunicode) < 0)
4416 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004417 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004418 if (replen > 1) {
4419 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004420 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004421 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4422 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4423 goto onError;
4424 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004426 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 Py_XDECREF(restuple);
4433 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438}
4439
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004440/* --- UTF-7 Codec -------------------------------------------------------- */
4441
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442/* See RFC2152 for details. We encode conservatively and decode liberally. */
4443
/* Three simple macros defining base-64.  All of them evaluate their
   argument more than once (or index with it), so only pass expressions
   without side effects. */

/* Is c a base-64 character, i.e. one of A-Z, a-z, 0-9, '+' or '/'? */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('a' <= (c) && (c) <= 'z') ||              \
     ('A' <= (c) && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything
   else, which by the precondition is '/', -> 63) */

#define FROM_BASE64(c)                                          \
    ((c) == '+' ? 62 :                                          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :              \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :              \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4474
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4508
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only 1..127 can ever be direct
 * directO  - non-zero if "Set O" characters (category 1) are direct
 * directWS - non-zero if whitespace (category 2) is direct
 *
 * Every argument is parenthesized in the expansion so that arbitrary
 * expressions (e.g. "a ? b : c") can be passed safely.  c is evaluated
 * several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520
Alexander Belopolsky40018472011-02-26 01:02:56 +00004521PyObject *
4522PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004523 Py_ssize_t size,
4524 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004525{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004526 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4527}
4528
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529/* The decoder. The only state we preserve is our read position,
4530 * i.e. how many characters we have consumed. So if we end in the
4531 * middle of a shift sequence we have to back off the read position
4532 * and the output to the beginning of the sequence, otherwise we lose
4533 * all the shift state (seen bits, number of bits seen, high
4534 * surrogate). */
4535
Alexander Belopolsky40018472011-02-26 01:02:56 +00004536PyObject *
4537PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004538 Py_ssize_t size,
4539 const char *errors,
4540 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004541{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004543 Py_ssize_t startinpos;
4544 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004545 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 const char *errmsg = "";
4548 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004549 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550 unsigned int base64bits = 0;
4551 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004552 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 PyObject *errorHandler = NULL;
4554 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556 if (size == 0) {
4557 if (consumed)
4558 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004559 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004560 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004562 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004563 _PyUnicodeWriter_Init(&writer);
4564 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004565
4566 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567 e = s + size;
4568
4569 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004570 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004572 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 if (inShift) { /* in a base-64 section */
4575 if (IS_BASE64(ch)) { /* consume a base-64 character */
4576 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4577 base64bits += 6;
4578 s++;
4579 if (base64bits >= 16) {
4580 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004581 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 base64bits -= 16;
4583 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004584 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (surrogate) {
4586 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004587 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4588 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004589 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004590 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004592 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 }
4594 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004595 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004596 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 }
4599 }
Victor Stinner551ac952011-11-29 22:58:13 +01004600 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 /* first surrogate */
4602 surrogate = outCh;
4603 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 }
4608 }
4609 }
4610 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 if (base64bits > 0) { /* left-over bits */
4613 if (base64bits >= 6) {
4614 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004615 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 errmsg = "partial character in shift sequence";
4617 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 else {
4620 /* Some bits remain; they should be zero */
4621 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004622 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 errmsg = "non-zero padding bits in shift sequence";
4624 goto utf7Error;
4625 }
4626 }
4627 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004628 if (surrogate && DECODE_DIRECT(ch)) {
4629 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4630 goto onError;
4631 }
4632 surrogate = 0;
4633 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* '-' is absorbed; other terminating
4635 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
4639 }
4640 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 s++; /* consume '+' */
4643 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004645 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004646 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 }
4648 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004650 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004651 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004653 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 }
4655 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004658 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004659 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 else {
4662 startinpos = s-starts;
4663 s++;
4664 errmsg = "unexpected special character";
4665 goto utf7Error;
4666 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 errors, &errorHandler,
4672 "utf7", errmsg,
4673 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676 }
4677
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 /* end of string */
4679
4680 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4681 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004682 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683 if (surrogate ||
4684 (base64bits >= 6) ||
4685 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004687 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 errors, &errorHandler,
4689 "utf7", "unterminated shift sequence",
4690 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004691 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 goto onError;
4693 if (s < e)
4694 goto restart;
4695 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697
4698 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004699 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004702 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004703 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004704 writer.kind, writer.data, shiftOutStart);
4705 Py_XDECREF(errorHandler);
4706 Py_XDECREF(exc);
4707 _PyUnicodeWriter_Dealloc(&writer);
4708 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004709 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004710 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 }
4712 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004713 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 Py_XDECREF(errorHandler);
4723 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004724 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725 return NULL;
4726}
4727
4728
Alexander Belopolsky40018472011-02-26 01:02:56 +00004729PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004730_PyUnicode_EncodeUTF7(PyObject *str,
4731 int base64SetO,
4732 int base64WhiteSpace,
4733 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004734{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004735 int kind;
4736 void *data;
4737 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 unsigned int base64bits = 0;
4742 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004743 char * out;
4744 char * start;
4745
Benjamin Petersonbac79492012-01-14 13:34:47 -05004746 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004747 return NULL;
4748 kind = PyUnicode_KIND(str);
4749 data = PyUnicode_DATA(str);
4750 len = PyUnicode_GET_LENGTH(str);
4751
4752 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004756 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004757 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004758 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 if (v == NULL)
4760 return NULL;
4761
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004762 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004764 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 if (inShift) {
4767 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4768 /* shifting out */
4769 if (base64bits) { /* output remaining bits */
4770 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4771 base64buffer = 0;
4772 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773 }
4774 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 /* Characters not in the BASE64 set implicitly unshift the sequence
4776 so no '-' is required, except if the character is itself a '-' */
4777 if (IS_BASE64(ch) || ch == '-') {
4778 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004779 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004780 *out++ = (char) ch;
4781 }
4782 else {
4783 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004784 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 else { /* not in a shift sequence */
4787 if (ch == '+') {
4788 *out++ = '+';
4789 *out++ = '-';
4790 }
4791 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4792 *out++ = (char) ch;
4793 }
4794 else {
4795 *out++ = '+';
4796 inShift = 1;
4797 goto encode_char;
4798 }
4799 }
4800 continue;
4801encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004803 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004804
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 /* code first surrogate */
4806 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004807 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 while (base64bits >= 6) {
4809 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4810 base64bits -= 6;
4811 }
4812 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004813 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 base64bits += 16;
4816 base64buffer = (base64buffer << 16) | ch;
4817 while (base64bits >= 6) {
4818 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4819 base64bits -= 6;
4820 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004821 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 if (base64bits)
4823 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4824 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004826 if (_PyBytes_Resize(&v, out - start) < 0)
4827 return NULL;
4828 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004830PyObject *
4831PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4832 Py_ssize_t size,
4833 int base64SetO,
4834 int base64WhiteSpace,
4835 const char *errors)
4836{
4837 PyObject *result;
4838 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4839 if (tmp == NULL)
4840 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004841 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004842 base64WhiteSpace, errors);
4843 Py_DECREF(tmp);
4844 return result;
4845}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004846
Antoine Pitrou244651a2009-05-04 18:56:13 +00004847#undef IS_BASE64
4848#undef FROM_BASE64
4849#undef TO_BASE64
4850#undef DECODE_DIRECT
4851#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853/* --- UTF-8 Codec -------------------------------------------------------- */
4854
Alexander Belopolsky40018472011-02-26 01:02:56 +00004855PyObject *
4856PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004857 Py_ssize_t size,
4858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Walter Dörwald69652032004-09-07 20:24:22 +00004860 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4861}
4862
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863#include "stringlib/asciilib.h"
4864#include "stringlib/codecs.h"
4865#include "stringlib/undef.h"
4866
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004867#include "stringlib/ucs1lib.h"
4868#include "stringlib/codecs.h"
4869#include "stringlib/undef.h"
4870
4871#include "stringlib/ucs2lib.h"
4872#include "stringlib/codecs.h"
4873#include "stringlib/undef.h"
4874
4875#include "stringlib/ucs4lib.h"
4876#include "stringlib/codecs.h"
4877#include "stringlib/undef.h"
4878
Antoine Pitrouab868312009-01-10 15:40:25 +00004879/* Mask to quickly check whether a C 'long' contains a
4880 non-ASCII, UTF8-encoded char. */
4881#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004882# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004883#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004884# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004885#else
4886# error C 'long' size should be either 4 or 8!
4887#endif
4888
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889static Py_ssize_t
4890ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004893 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004894
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004895 /*
4896 * Issue #17237: m68k is a bit different from most architectures in
4897 * that objects do not use "natural alignment" - for example, int and
4898 * long are only aligned at 2-byte boundaries. Therefore the assert()
4899 * won't work; also, tests have shown that skipping the "optimised
4900 * version" will even speed up m68k.
4901 */
4902#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004904 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4905 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906 /* Fast path, see in STRINGLIB(utf8_decode) for
4907 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004908 /* Help allocation */
4909 const char *_p = p;
4910 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 while (_p < aligned_end) {
4912 unsigned long value = *(const unsigned long *) _p;
4913 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 *((unsigned long *)q) = value;
4916 _p += SIZEOF_LONG;
4917 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004918 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 p = _p;
4920 while (p < end) {
4921 if ((unsigned char)*p & 0x80)
4922 break;
4923 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004928#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 while (p < end) {
4930 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4931 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004932 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004933 /* Help allocation */
4934 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 while (_p < aligned_end) {
4936 unsigned long value = *(unsigned long *) _p;
4937 if (value & ASCII_CHAR_MASK)
4938 break;
4939 _p += SIZEOF_LONG;
4940 }
4941 p = _p;
4942 if (_p == end)
4943 break;
4944 }
4945 if ((unsigned char)*p & 0x80)
4946 break;
4947 ++p;
4948 }
4949 memcpy(dest, start, p - start);
4950 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951}
Antoine Pitrouab868312009-01-10 15:40:25 +00004952
Victor Stinner785938e2011-12-11 20:09:03 +01004953PyObject *
4954PyUnicode_DecodeUTF8Stateful(const char *s,
4955 Py_ssize_t size,
4956 const char *errors,
4957 Py_ssize_t *consumed)
4958{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004959 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004960 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962
4963 Py_ssize_t startinpos;
4964 Py_ssize_t endinpos;
4965 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004966 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004968 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004969
4970 if (size == 0) {
4971 if (consumed)
4972 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004973 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004974 }
4975
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4977 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004978 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 *consumed = 1;
4980 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004981 }
4982
Victor Stinner8f674cc2013-04-17 23:02:17 +02004983 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004984 writer.min_length = size;
4985 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004986 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004987
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 writer.pos = ascii_decode(s, end, writer.data);
4989 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 while (s < end) {
4991 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004992 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004993
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 if (PyUnicode_IS_ASCII(writer.buffer))
4996 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 } else {
5002 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 }
5005
5006 switch (ch) {
5007 case 0:
5008 if (s == end || consumed)
5009 goto End;
5010 errmsg = "unexpected end of data";
5011 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005012 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 break;
5014 case 1:
5015 errmsg = "invalid start byte";
5016 startinpos = s - starts;
5017 endinpos = startinpos + 1;
5018 break;
5019 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005020 case 3:
5021 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 errmsg = "invalid continuation byte";
5023 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005024 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 break;
5026 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005027 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 goto onError;
5029 continue;
5030 }
5031
Victor Stinner1d65d912015-10-05 13:43:50 +02005032 if (error_handler == _Py_ERROR_UNKNOWN)
5033 error_handler = get_error_handler(errors);
5034
5035 switch (error_handler) {
5036 case _Py_ERROR_IGNORE:
5037 s += (endinpos - startinpos);
5038 break;
5039
5040 case _Py_ERROR_REPLACE:
5041 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5042 goto onError;
5043 s += (endinpos - startinpos);
5044 break;
5045
5046 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005047 {
5048 Py_ssize_t i;
5049
Victor Stinner1d65d912015-10-05 13:43:50 +02005050 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5051 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005052 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005053 ch = (Py_UCS4)(unsigned char)(starts[i]);
5054 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5055 ch + 0xdc00);
5056 writer.pos++;
5057 }
5058 s += (endinpos - startinpos);
5059 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005060 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005061
5062 default:
5063 if (unicode_decode_call_errorhandler_writer(
5064 errors, &error_handler_obj,
5065 "utf-8", errmsg,
5066 &starts, &end, &startinpos, &endinpos, &exc, &s,
5067 &writer))
5068 goto onError;
5069 }
Victor Stinner785938e2011-12-11 20:09:03 +01005070 }
5071
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 if (consumed)
5074 *consumed = s - starts;
5075
Victor Stinner1d65d912015-10-05 13:43:50 +02005076 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005078 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079
5080onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005081 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005085}
5086
Xavier de Gaye76febd02016-12-15 20:59:58 +01005087#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088
5089/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005090 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005091
5092 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005093 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005094
5095wchar_t*
5096_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5097{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 wchar_t *unicode;
5100 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005101
5102 /* Note: size will always be longer than the resulting Unicode
5103 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005104 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005105 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005106 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005107 if (!unicode)
5108 return NULL;
5109
5110 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005111 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005112 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005113 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005118 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 if (ch > 0xFF) {
5121#if SIZEOF_WCHAR_T == 4
5122 assert(0);
5123#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005124 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005125 /* compute and append the two surrogates: */
5126 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5127 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5128#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 else {
5131 if (!ch && s == e)
5132 break;
5133 /* surrogateescape */
5134 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5135 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138 return unicode;
5139}
5140
Xavier de Gaye76febd02016-12-15 20:59:58 +01005141#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143/* Primary internal function which creates utf8 encoded bytes objects.
5144
5145 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005146 and allocate exactly as much space needed at the end. Else allocate the
5147 maximum possible needed (4 result bytes per Unicode character), and return
5148 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005149*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005150PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005151_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152{
Victor Stinner6099a032011-12-18 14:22:26 +01005153 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154 void *data;
5155 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157 if (!PyUnicode_Check(unicode)) {
5158 PyErr_BadArgument();
5159 return NULL;
5160 }
5161
5162 if (PyUnicode_READY(unicode) == -1)
5163 return NULL;
5164
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005165 if (PyUnicode_UTF8(unicode))
5166 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5167 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168
5169 kind = PyUnicode_KIND(unicode);
5170 data = PyUnicode_DATA(unicode);
5171 size = PyUnicode_GET_LENGTH(unicode);
5172
Benjamin Petersonead6b532011-12-20 17:23:42 -06005173 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005174 default:
5175 assert(0);
5176 case PyUnicode_1BYTE_KIND:
5177 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5178 assert(!PyUnicode_IS_ASCII(unicode));
5179 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5180 case PyUnicode_2BYTE_KIND:
5181 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5182 case PyUnicode_4BYTE_KIND:
5183 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185}
5186
Alexander Belopolsky40018472011-02-26 01:02:56 +00005187PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005188PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5189 Py_ssize_t size,
5190 const char *errors)
5191{
5192 PyObject *v, *unicode;
5193
5194 unicode = PyUnicode_FromUnicode(s, size);
5195 if (unicode == NULL)
5196 return NULL;
5197 v = _PyUnicode_AsUTF8String(unicode, errors);
5198 Py_DECREF(unicode);
5199 return v;
5200}
5201
5202PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005203PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206}
5207
Walter Dörwald41980ca2007-08-16 21:55:45 +00005208/* --- UTF-32 Codec ------------------------------------------------------- */
5209
5210PyObject *
5211PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 Py_ssize_t size,
5213 const char *errors,
5214 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215{
5216 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5217}
5218
5219PyObject *
5220PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 Py_ssize_t size,
5222 const char *errors,
5223 int *byteorder,
5224 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
5226 const char *starts = s;
5227 Py_ssize_t startinpos;
5228 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005230 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005231 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005232 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234 PyObject *errorHandler = NULL;
5235 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005236
Walter Dörwald41980ca2007-08-16 21:55:45 +00005237 q = (unsigned char *)s;
5238 e = q + size;
5239
5240 if (byteorder)
5241 bo = *byteorder;
5242
5243 /* Check for BOM marks (U+FEFF) in the input and adjust current
5244 byte order setting accordingly. In native mode, the leading BOM
5245 mark is skipped, in all other modes, it is copied to the output
5246 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005247 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005248 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 if (bom == 0x0000FEFF) {
5250 bo = -1;
5251 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005253 else if (bom == 0xFFFE0000) {
5254 bo = 1;
5255 q += 4;
5256 }
5257 if (byteorder)
5258 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259 }
5260
Victor Stinnere64322e2012-10-30 23:12:47 +01005261 if (q == e) {
5262 if (consumed)
5263 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005264 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 }
5266
Victor Stinnere64322e2012-10-30 23:12:47 +01005267#ifdef WORDS_BIGENDIAN
5268 le = bo < 0;
5269#else
5270 le = bo <= 0;
5271#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005272 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005273
Victor Stinner8f674cc2013-04-17 23:02:17 +02005274 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005275 writer.min_length = (e - q + 3) / 4;
5276 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005277 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 while (1) {
5280 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005281 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005282
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005284 enum PyUnicode_Kind kind = writer.kind;
5285 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005286 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005287 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005288 if (le) {
5289 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005290 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 if (ch > maxch)
5292 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005293 if (kind != PyUnicode_1BYTE_KIND &&
5294 Py_UNICODE_IS_SURROGATE(ch))
5295 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 q += 4;
5298 } while (q <= last);
5299 }
5300 else {
5301 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005302 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (ch > maxch)
5304 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005305 if (kind != PyUnicode_1BYTE_KIND &&
5306 Py_UNICODE_IS_SURROGATE(ch))
5307 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 q += 4;
5310 } while (q <= last);
5311 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 }
5314
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005316 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005317 startinpos = ((const char *)q) - starts;
5318 endinpos = startinpos + 4;
5319 }
5320 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005325 startinpos = ((const char *)q) - starts;
5326 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 else {
5329 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005330 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 goto onError;
5332 q += 4;
5333 continue;
5334 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005335 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 startinpos = ((const char *)q) - starts;
5337 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005339
5340 /* The remaining input chars are ignored if the callback
5341 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005342 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005344 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005346 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005348 }
5349
Walter Dörwald41980ca2007-08-16 21:55:45 +00005350 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353 Py_XDECREF(errorHandler);
5354 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005355 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 Py_XDECREF(errorHandler);
5360 Py_XDECREF(exc);
5361 return NULL;
5362}
5363
5364PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005365_PyUnicode_EncodeUTF32(PyObject *str,
5366 const char *errors,
5367 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005369 enum PyUnicode_Kind kind;
5370 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005371 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005372 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005373 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005374#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005375 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005376#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005377 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005379 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005381 PyObject *errorHandler = NULL;
5382 PyObject *exc = NULL;
5383 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385 if (!PyUnicode_Check(str)) {
5386 PyErr_BadArgument();
5387 return NULL;
5388 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005389 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390 return NULL;
5391 kind = PyUnicode_KIND(str);
5392 data = PyUnicode_DATA(str);
5393 len = PyUnicode_GET_LENGTH(str);
5394
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005396 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005398 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 if (v == NULL)
5400 return NULL;
5401
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005402 /* output buffer is 4-bytes aligned */
5403 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005404 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005411 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005414 else
5415 encoding = "utf-32";
5416
5417 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5419 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420 }
5421
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 pos = 0;
5423 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425
5426 if (kind == PyUnicode_2BYTE_KIND) {
5427 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5428 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 else {
5431 assert(kind == PyUnicode_4BYTE_KIND);
5432 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5433 &out, native_ordering);
5434 }
5435 if (pos == len)
5436 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005437
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 rep = unicode_encode_call_errorhandler(
5439 errors, &errorHandler,
5440 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 if (!rep)
5443 goto error;
5444
5445 if (PyBytes_Check(rep)) {
5446 repsize = PyBytes_GET_SIZE(rep);
5447 if (repsize & 3) {
5448 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 "surrogates not allowed");
5451 goto error;
5452 }
5453 moreunits = repsize / 4;
5454 }
5455 else {
5456 assert(PyUnicode_Check(rep));
5457 if (PyUnicode_READY(rep) < 0)
5458 goto error;
5459 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5460 if (!PyUnicode_IS_ASCII(rep)) {
5461 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005462 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005463 "surrogates not allowed");
5464 goto error;
5465 }
5466 }
5467
5468 /* four bytes are reserved for each surrogate */
5469 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005470 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 Py_ssize_t morebytes = 4 * (moreunits - 1);
5472 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5473 /* integer overflow */
5474 PyErr_NoMemory();
5475 goto error;
5476 }
5477 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5478 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005479 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005480 }
5481
5482 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005483 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005484 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005486 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005487 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5488 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 }
5490
5491 Py_CLEAR(rep);
5492 }
5493
5494 /* Cut back to size actually needed. This is necessary for, for example,
5495 encoding of a string containing isolated surrogates and the 'ignore'
5496 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (nsize != PyBytes_GET_SIZE(v))
5499 _PyBytes_Resize(&v, nsize);
5500 Py_XDECREF(errorHandler);
5501 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005502 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005503 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005504 error:
5505 Py_XDECREF(rep);
5506 Py_XDECREF(errorHandler);
5507 Py_XDECREF(exc);
5508 Py_XDECREF(v);
5509 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005510}
5511
Alexander Belopolsky40018472011-02-26 01:02:56 +00005512PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005513PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5514 Py_ssize_t size,
5515 const char *errors,
5516 int byteorder)
5517{
5518 PyObject *result;
5519 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5520 if (tmp == NULL)
5521 return NULL;
5522 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5523 Py_DECREF(tmp);
5524 return result;
5525}
5526
5527PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005528PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005529{
Victor Stinnerb960b342011-11-20 19:12:52 +01005530 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005531}
5532
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533/* --- UTF-16 Codec ------------------------------------------------------- */
5534
Tim Peters772747b2001-08-09 22:21:55 +00005535PyObject *
5536PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 Py_ssize_t size,
5538 const char *errors,
5539 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540{
Walter Dörwald69652032004-09-07 20:24:22 +00005541 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5542}
5543
5544PyObject *
5545PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 Py_ssize_t size,
5547 const char *errors,
5548 int *byteorder,
5549 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005550{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t startinpos;
5553 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005554 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005555 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005556 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005557 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005558 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 PyObject *errorHandler = NULL;
5560 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005561 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Tim Peters772747b2001-08-09 22:21:55 +00005563 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005564 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
5566 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005567 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005569 /* Check for BOM marks (U+FEFF) in the input and adjust current
5570 byte order setting accordingly. In native mode, the leading BOM
5571 mark is skipped, in all other modes, it is copied to the output
5572 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 if (bo == 0 && size >= 2) {
5574 const Py_UCS4 bom = (q[1] << 8) | q[0];
5575 if (bom == 0xFEFF) {
5576 q += 2;
5577 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005579 else if (bom == 0xFFFE) {
5580 q += 2;
5581 bo = 1;
5582 }
5583 if (byteorder)
5584 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 if (q == e) {
5588 if (consumed)
5589 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005590 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005591 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005592
Christian Heimes743e0cd2012-10-17 23:52:17 +02005593#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005596#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005598 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005599#endif
Tim Peters772747b2001-08-09 22:21:55 +00005600
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 /* Note: size will always be longer than the resulting Unicode
5602 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005603 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005604 writer.min_length = (e - q + 1) / 2;
5605 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 while (1) {
5609 Py_UCS4 ch = 0;
5610 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005613 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005614 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 native_ordering);
5617 else
5618 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 native_ordering);
5621 } else if (kind == PyUnicode_2BYTE_KIND) {
5622 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 native_ordering);
5625 } else {
5626 assert(kind == PyUnicode_4BYTE_KIND);
5627 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005631 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632
Antoine Pitrou63065d72012-05-15 23:48:04 +02005633 switch (ch)
5634 {
5635 case 0:
5636 /* remaining byte at the end? (size should be even) */
5637 if (q == e || consumed)
5638 goto End;
5639 errmsg = "truncated data";
5640 startinpos = ((const char *)q) - starts;
5641 endinpos = ((const char *)e) - starts;
5642 break;
5643 /* The remaining input chars are ignored if the callback
5644 chooses to skip the input */
5645 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005646 q -= 2;
5647 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005648 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005650 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 endinpos = ((const char *)e) - starts;
5652 break;
5653 case 2:
5654 errmsg = "illegal encoding";
5655 startinpos = ((const char *)q) - 2 - starts;
5656 endinpos = startinpos + 2;
5657 break;
5658 case 3:
5659 errmsg = "illegal UTF-16 surrogate";
5660 startinpos = ((const char *)q) - 4 - starts;
5661 endinpos = startinpos + 2;
5662 break;
5663 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005664 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005665 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 continue;
5667 }
5668
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005669 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005670 errors,
5671 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005672 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005673 &starts,
5674 (const char **)&e,
5675 &startinpos,
5676 &endinpos,
5677 &exc,
5678 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 }
5682
Antoine Pitrou63065d72012-05-15 23:48:04 +02005683End:
Walter Dörwald69652032004-09-07 20:24:22 +00005684 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 Py_XDECREF(errorHandler);
5688 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005692 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 Py_XDECREF(errorHandler);
5694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 return NULL;
5696}
5697
Tim Peters772747b2001-08-09 22:21:55 +00005698PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005699_PyUnicode_EncodeUTF16(PyObject *str,
5700 const char *errors,
5701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005703 enum PyUnicode_Kind kind;
5704 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005705 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005706 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005708 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005709#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005710 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005711#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005712 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005713#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005714 const char *encoding;
5715 Py_ssize_t nsize, pos;
5716 PyObject *errorHandler = NULL;
5717 PyObject *exc = NULL;
5718 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005719
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720 if (!PyUnicode_Check(str)) {
5721 PyErr_BadArgument();
5722 return NULL;
5723 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005724 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725 return NULL;
5726 kind = PyUnicode_KIND(str);
5727 data = PyUnicode_DATA(str);
5728 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005731 if (kind == PyUnicode_4BYTE_KIND) {
5732 const Py_UCS4 *in = (const Py_UCS4 *)data;
5733 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005734 while (in < end) {
5735 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005737 }
5738 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005739 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005740 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 nsize = len + pairs + (byteorder == 0);
5744 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005750 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005751 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005753 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 }
5755 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005756 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Tim Peters772747b2001-08-09 22:21:55 +00005758
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 if (kind == PyUnicode_1BYTE_KIND) {
5760 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5761 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005762 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005763
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005766 }
5767 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 }
5770 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773
5774 pos = 0;
5775 while (pos < len) {
5776 Py_ssize_t repsize, moreunits;
5777
5778 if (kind == PyUnicode_2BYTE_KIND) {
5779 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5780 &out, native_ordering);
5781 }
5782 else {
5783 assert(kind == PyUnicode_4BYTE_KIND);
5784 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5785 &out, native_ordering);
5786 }
5787 if (pos == len)
5788 break;
5789
5790 rep = unicode_encode_call_errorhandler(
5791 errors, &errorHandler,
5792 encoding, "surrogates not allowed",
5793 str, &exc, pos, pos + 1, &pos);
5794 if (!rep)
5795 goto error;
5796
5797 if (PyBytes_Check(rep)) {
5798 repsize = PyBytes_GET_SIZE(rep);
5799 if (repsize & 1) {
5800 raise_encode_exception(&exc, encoding,
5801 str, pos - 1, pos,
5802 "surrogates not allowed");
5803 goto error;
5804 }
5805 moreunits = repsize / 2;
5806 }
5807 else {
5808 assert(PyUnicode_Check(rep));
5809 if (PyUnicode_READY(rep) < 0)
5810 goto error;
5811 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5812 if (!PyUnicode_IS_ASCII(rep)) {
5813 raise_encode_exception(&exc, encoding,
5814 str, pos - 1, pos,
5815 "surrogates not allowed");
5816 goto error;
5817 }
5818 }
5819
5820 /* two bytes are reserved for each surrogate */
5821 if (moreunits > 1) {
5822 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5823 Py_ssize_t morebytes = 2 * (moreunits - 1);
5824 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5825 /* integer overflow */
5826 PyErr_NoMemory();
5827 goto error;
5828 }
5829 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5830 goto error;
5831 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5832 }
5833
5834 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005835 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 out += moreunits;
5837 } else /* rep is unicode */ {
5838 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5839 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5840 &out, native_ordering);
5841 }
5842
5843 Py_CLEAR(rep);
5844 }
5845
5846 /* Cut back to size actually needed. This is necessary for, for example,
5847 encoding of a string containing isolated surrogates and the 'ignore' handler
5848 is used. */
5849 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5850 if (nsize != PyBytes_GET_SIZE(v))
5851 _PyBytes_Resize(&v, nsize);
5852 Py_XDECREF(errorHandler);
5853 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005854 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005855 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005856 error:
5857 Py_XDECREF(rep);
5858 Py_XDECREF(errorHandler);
5859 Py_XDECREF(exc);
5860 Py_XDECREF(v);
5861 return NULL;
5862#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863}
5864
Alexander Belopolsky40018472011-02-26 01:02:56 +00005865PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005866PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5867 Py_ssize_t size,
5868 const char *errors,
5869 int byteorder)
5870{
5871 PyObject *result;
5872 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5873 if (tmp == NULL)
5874 return NULL;
5875 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5876 Py_DECREF(tmp);
5877 return result;
5878}
5879
5880PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005881PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
/* --- Unicode Escape Codec ----------------------------------------------- */
5887
Fredrik Lundh06d12682001-01-24 07:59:11 +00005888static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005889
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890PyObject *
Eric V. Smith56466482016-10-31 14:46:26 -04005891_PyUnicode_DecodeUnicodeEscape(const char *s,
5892 Py_ssize_t size,
5893 const char *errors,
5894 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005897 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 PyObject *errorHandler = NULL;
5900 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005901
Eric V. Smith56466482016-10-31 14:46:26 -04005902 // so we can remember if we've seen an invalid escape char or not
5903 *first_invalid_escape = NULL;
5904
Victor Stinner62ec3312016-09-06 17:04:34 -07005905 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005906 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005907 }
5908 /* Escaped strings will always be longer than the resulting
5909 Unicode string, so we start with size here and then reduce the
5910 length after conversion to the true value.
5911 (but if the error callback returns a long replacement string
5912 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005913 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005914 writer.min_length = size;
5915 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5916 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005917 }
5918
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 end = s + size;
5920 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005921 unsigned char c = (unsigned char) *s++;
5922 Py_UCS4 ch;
5923 int count;
5924 Py_ssize_t startinpos;
5925 Py_ssize_t endinpos;
5926 const char *message;
5927
5928#define WRITE_ASCII_CHAR(ch) \
5929 do { \
5930 assert(ch <= 127); \
5931 assert(writer.pos < writer.size); \
5932 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5933 } while(0)
5934
5935#define WRITE_CHAR(ch) \
5936 do { \
5937 if (ch <= writer.maxchar) { \
5938 assert(writer.pos < writer.size); \
5939 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5940 } \
5941 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5942 goto onError; \
5943 } \
5944 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
5946 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005947 if (c != '\\') {
5948 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 continue;
5950 }
5951
Victor Stinner62ec3312016-09-06 17:04:34 -07005952 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005954 if (s >= end) {
5955 message = "\\ at end of string";
5956 goto error;
5957 }
5958 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005959
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005961 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 case '\n': continue;
5965 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5966 case '\'': WRITE_ASCII_CHAR('\''); continue;
5967 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5968 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5971 case 't': WRITE_ASCII_CHAR('\t'); continue;
5972 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5973 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005974 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005976 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 case '0': case '1': case '2': case '3':
5981 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005982 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005983 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 ch = (ch<<3) + *s++ - '0';
5985 if (s < end && '0' <= *s && *s <= '7') {
5986 ch = (ch<<3) + *s++ - '0';
5987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005989 WRITE_CHAR(ch);
5990 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* hex escapes */
5993 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005996 message = "truncated \\xXX escape";
5997 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006002 message = "truncated \\uXXXX escape";
6003 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006006 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006007 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006008 message = "truncated \\UXXXXXXXX escape";
6009 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006011 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 ch <<= 4;
6013 if (c >= '0' && c <= '9') {
6014 ch += c - '0';
6015 }
6016 else if (c >= 'a' && c <= 'f') {
6017 ch += c - ('a' - 10);
6018 }
6019 else if (c >= 'A' && c <= 'F') {
6020 ch += c - ('A' - 10);
6021 }
6022 else {
6023 break;
6024 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006025 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006027 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 }
6029
6030 /* when we get here, ch is a 32-bit unicode character */
6031 if (ch > MAX_UNICODE) {
6032 message = "illegal Unicode character";
6033 goto error;
6034 }
6035
6036 WRITE_CHAR(ch);
6037 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006038
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 if (ucnhash_CAPI == NULL) {
6042 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006043 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6044 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 if (ucnhash_CAPI == NULL) {
6046 PyErr_SetString(
6047 PyExc_UnicodeError,
6048 "\\N escapes not supported (can't load unicodedata module)"
6049 );
6050 goto onError;
6051 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006052 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006053
6054 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006056 const char *start = ++s;
6057 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006061 namelen = s - start;
6062 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 ch = 0xffffffff; /* in case 'getcode' messes up */
6066 if (namelen <= INT_MAX &&
6067 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6068 &ch, 0)) {
6069 assert(ch <= MAX_UNICODE);
6070 WRITE_CHAR(ch);
6071 continue;
6072 }
6073 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 }
6075 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006076 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077
6078 default:
Eric V. Smith56466482016-10-31 14:46:26 -04006079 if (*first_invalid_escape == NULL) {
6080 *first_invalid_escape = s-1; /* Back up one char, since we've
6081 already incremented s. */
6082 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006083 WRITE_ASCII_CHAR('\\');
6084 WRITE_CHAR(c);
6085 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006087
6088 error:
6089 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006090 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006091 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006092 errors, &errorHandler,
6093 "unicodeescape", message,
6094 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006095 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006096 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006097 }
6098 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6099 goto onError;
6100 }
6101
6102#undef WRITE_ASCII_CHAR
6103#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006105
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006106 Py_XDECREF(errorHandler);
6107 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006108 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006109
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006111 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return NULL;
6115}
6116
Eric V. Smith56466482016-10-31 14:46:26 -04006117PyObject *
6118PyUnicode_DecodeUnicodeEscape(const char *s,
6119 Py_ssize_t size,
6120 const char *errors)
6121{
6122 const char *first_invalid_escape;
6123 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6124 &first_invalid_escape);
6125 if (result == NULL)
6126 return NULL;
6127 if (first_invalid_escape != NULL) {
6128 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6129 "invalid escape sequence '\\%c'",
6130 *first_invalid_escape) < 0) {
6131 Py_DECREF(result);
6132 return NULL;
6133 }
6134 }
6135 return result;
6136}
6137
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006138/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006146 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006148 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Ezio Melottie7f90372012-10-05 03:33:31 +03006150 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006151 escape.
6152
Ezio Melottie7f90372012-10-05 03:33:31 +03006153 For UCS1 strings it's '\xxx', 4 bytes per source character.
6154 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6155 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006156 */
6157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (!PyUnicode_Check(unicode)) {
6159 PyErr_BadArgument();
6160 return NULL;
6161 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006164 }
Victor Stinner358af132015-10-12 22:36:57 +02006165
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 if (len == 0) {
6168 return PyBytes_FromStringAndSize(NULL, 0);
6169 }
6170
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 kind = PyUnicode_KIND(unicode);
6172 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006176 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 return PyErr_NoMemory();
6178 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006179 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 if (repr == NULL) {
6181 return NULL;
6182 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183
Victor Stinner62ec3312016-09-06 17:04:34 -07006184 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006186 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006187
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 /* U+0000-U+00ff range */
6189 if (ch < 0x100) {
6190 if (ch >= ' ' && ch < 127) {
6191 if (ch != '\\') {
6192 /* Copy printable US ASCII as-is */
6193 *p++ = (char) ch;
6194 }
6195 /* Escape backslashes */
6196 else {
6197 *p++ = '\\';
6198 *p++ = '\\';
6199 }
6200 }
Victor Stinner358af132015-10-12 22:36:57 +02006201
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 /* Map special whitespace to '\t', \n', '\r' */
6203 else if (ch == '\t') {
6204 *p++ = '\\';
6205 *p++ = 't';
6206 }
6207 else if (ch == '\n') {
6208 *p++ = '\\';
6209 *p++ = 'n';
6210 }
6211 else if (ch == '\r') {
6212 *p++ = '\\';
6213 *p++ = 'r';
6214 }
6215
6216 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6217 else {
6218 *p++ = '\\';
6219 *p++ = 'x';
6220 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6221 *p++ = Py_hexdigits[ch & 0x000F];
6222 }
Tim Petersced69f82003-09-16 20:30:58 +00006223 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006224 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 *p++ = '\\';
6227 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006228 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6229 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6230 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6231 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006233 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6234 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006235
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 /* Make sure that the first two digits are zero */
6237 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006238 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 *p++ = 'U';
6240 *p++ = '0';
6241 *p++ = '0';
6242 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6243 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6244 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6245 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6246 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6247 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 assert(p - PyBytes_AS_STRING(repr) > 0);
6252 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6253 return NULL;
6254 }
6255 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256}
6257
Alexander Belopolsky40018472011-02-26 01:02:56 +00006258PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006259PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6260 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006262 PyObject *result;
6263 PyObject *tmp = PyUnicode_FromUnicode(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 }
6267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268 result = PyUnicode_AsUnicodeEscapeString(tmp);
6269 Py_DECREF(tmp);
6270 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
6273/* --- Raw Unicode Escape Codec ------------------------------------------- */
6274
Alexander Belopolsky40018472011-02-26 01:02:56 +00006275PyObject *
6276PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006277 Py_ssize_t size,
6278 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006280 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006281 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 PyObject *errorHandler = NULL;
6284 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006285
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006287 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 /* Escaped strings will always be longer than the resulting
6291 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 length after conversion to the true value. (But decoding error
6293 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006294 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 writer.min_length = size;
6296 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6297 goto onError;
6298 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 end = s + size;
6301 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 unsigned char c = (unsigned char) *s++;
6303 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006304 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 Py_ssize_t startinpos;
6306 Py_ssize_t endinpos;
6307 const char *message;
6308
6309#define WRITE_CHAR(ch) \
6310 do { \
6311 if (ch <= writer.maxchar) { \
6312 assert(writer.pos < writer.size); \
6313 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6314 } \
6315 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6316 goto onError; \
6317 } \
6318 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 if (c != '\\' || s >= end) {
6322 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006325
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 c = (unsigned char) *s++;
6327 if (c == 'u') {
6328 count = 4;
6329 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 else if (c == 'U') {
6332 count = 8;
6333 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006334 }
6335 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 assert(writer.pos < writer.size);
6337 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6338 WRITE_CHAR(c);
6339 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006340 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 startinpos = s - starts - 2;
6342
6343 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6344 for (ch = 0; count && s < end; ++s, --count) {
6345 c = (unsigned char)*s;
6346 ch <<= 4;
6347 if (c >= '0' && c <= '9') {
6348 ch += c - '0';
6349 }
6350 else if (c >= 'a' && c <= 'f') {
6351 ch += c - ('a' - 10);
6352 }
6353 else if (c >= 'A' && c <= 'F') {
6354 ch += c - ('A' - 10);
6355 }
6356 else {
6357 break;
6358 }
6359 }
6360 if (!count) {
6361 if (ch <= MAX_UNICODE) {
6362 WRITE_CHAR(ch);
6363 continue;
6364 }
6365 message = "\\Uxxxxxxxx out of range";
6366 }
6367
6368 endinpos = s-starts;
6369 writer.min_length = end - s + writer.pos;
6370 if (unicode_decode_call_errorhandler_writer(
6371 errors, &errorHandler,
6372 "rawunicodeescape", message,
6373 &starts, &end, &startinpos, &endinpos, &exc, &s,
6374 &writer)) {
6375 goto onError;
6376 }
6377 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6378 goto onError;
6379 }
6380
6381#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 Py_XDECREF(errorHandler);
6384 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006385 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006386
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006388 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389 Py_XDECREF(errorHandler);
6390 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393}
6394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006397PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 int kind;
6403 void *data;
6404 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 if (!PyUnicode_Check(unicode)) {
6407 PyErr_BadArgument();
6408 return NULL;
6409 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006413 kind = PyUnicode_KIND(unicode);
6414 data = PyUnicode_DATA(unicode);
6415 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 if (kind == PyUnicode_1BYTE_KIND) {
6417 return PyBytes_FromStringAndSize(data, len);
6418 }
Victor Stinner0e368262011-11-10 20:12:49 +01006419
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6421 bytes, and 1 byte characters 4. */
6422 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 if (len > PY_SSIZE_T_MAX / expandsize) {
6425 return PyErr_NoMemory();
6426 }
6427 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6428 if (repr == NULL) {
6429 return NULL;
6430 }
6431 if (len == 0) {
6432 return repr;
6433 }
6434
6435 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006436 for (pos = 0; pos < len; pos++) {
6437 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006438
Victor Stinner62ec3312016-09-06 17:04:34 -07006439 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6440 if (ch < 0x100) {
6441 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006442 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6444 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 *p++ = '\\';
6446 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006447 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6449 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6450 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6453 else {
6454 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6455 *p++ = '\\';
6456 *p++ = 'U';
6457 *p++ = '0';
6458 *p++ = '0';
6459 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6464 *p++ = Py_hexdigits[ch & 15];
6465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006467
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 assert(p > PyBytes_AS_STRING(repr));
6469 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6470 return NULL;
6471 }
6472 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473}
6474
Alexander Belopolsky40018472011-02-26 01:02:56 +00006475PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6477 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 PyObject *result;
6480 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6481 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006482 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6484 Py_DECREF(tmp);
6485 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006488/* --- Unicode Internal Codec ------------------------------------------- */
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
6491_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006492 Py_ssize_t size,
6493 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006494{
6495 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 Py_ssize_t startinpos;
6497 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006498 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 const char *end;
6500 const char *reason;
6501 PyObject *errorHandler = NULL;
6502 PyObject *exc = NULL;
6503
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006504 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006505 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006506 1))
6507 return NULL;
6508
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006509 if (size == 0)
6510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511
Victor Stinner8f674cc2013-04-17 23:02:17 +02006512 _PyUnicodeWriter_Init(&writer);
6513 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6514 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006516 }
6517 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006518
Victor Stinner8f674cc2013-04-17 23:02:17 +02006519 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006520 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006521 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006522 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006523 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006524 endinpos = end-starts;
6525 reason = "truncated input";
6526 goto error;
6527 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006528 /* We copy the raw representation one byte at a time because the
6529 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006530 ((char *) &uch)[0] = s[0];
6531 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006532#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006533 ((char *) &uch)[2] = s[2];
6534 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006535#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006536 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006537#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006538 /* We have to sanity check the raw data, otherwise doom looms for
6539 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006540 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541 endinpos = s - starts + Py_UNICODE_SIZE;
6542 reason = "illegal code point (> 0x10FFFF)";
6543 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006545#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006546 s += Py_UNICODE_SIZE;
6547#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006548 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006549 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 Py_UNICODE uch2;
6551 ((char *) &uch2)[0] = s[0];
6552 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006553 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554 {
Victor Stinner551ac952011-11-29 22:58:13 +01006555 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006557 }
6558 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559#endif
6560
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006561 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006563 continue;
6564
6565 error:
6566 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006567 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006568 errors, &errorHandler,
6569 "unicode_internal", reason,
6570 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006571 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006572 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006573 }
6574
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006575 Py_XDECREF(errorHandler);
6576 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006577 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006580 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006581 Py_XDECREF(errorHandler);
6582 Py_XDECREF(exc);
6583 return NULL;
6584}
6585
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586/* --- Latin-1 Codec ------------------------------------------------------ */
6587
Alexander Belopolsky40018472011-02-26 01:02:56 +00006588PyObject *
6589PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006590 Py_ssize_t size,
6591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006594 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595}
6596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006598static void
6599make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006600 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006601 PyObject *unicode,
6602 Py_ssize_t startpos, Py_ssize_t endpos,
6603 const char *reason)
6604{
6605 if (*exceptionObject == NULL) {
6606 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006608 encoding, unicode, startpos, endpos, reason);
6609 }
6610 else {
6611 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6612 goto onError;
6613 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6614 goto onError;
6615 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6616 goto onError;
6617 return;
6618 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006619 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 }
6621}
6622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006624static void
6625raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006626 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006627 PyObject *unicode,
6628 Py_ssize_t startpos, Py_ssize_t endpos,
6629 const char *reason)
6630{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006631 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006632 encoding, unicode, startpos, endpos, reason);
6633 if (*exceptionObject != NULL)
6634 PyCodec_StrictErrors(*exceptionObject);
6635}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636
6637/* error handling callback helper:
6638 build arguments, call the callback and check the arguments,
6639 put the result into newpos and return the replacement string, which
6640 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641static PyObject *
6642unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 PyObject **errorHandler,
6644 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t startpos, Py_ssize_t endpos,
6647 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006649 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 PyObject *restuple;
6652 PyObject *resunicode;
6653
6654 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 }
6659
Benjamin Petersonbac79492012-01-14 13:34:47 -05006660 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 return NULL;
6662 len = PyUnicode_GET_LENGTH(unicode);
6663
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006664 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006665 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668
6669 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006674 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 Py_DECREF(restuple);
6676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006678 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 &resunicode, newpos)) {
6680 Py_DECREF(restuple);
6681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006683 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6685 Py_DECREF(restuple);
6686 return NULL;
6687 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 *newpos = len + *newpos;
6690 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006691 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 Py_INCREF(resunicode);
6696 Py_DECREF(restuple);
6697 return resunicode;
6698}
6699
Alexander Belopolsky40018472011-02-26 01:02:56 +00006700static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006701unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006702 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006703 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006705 /* input state */
6706 Py_ssize_t pos=0, size;
6707 int kind;
6708 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 /* pointer into the output */
6710 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006711 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6712 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006713 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006715 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006716 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006717 /* output object */
6718 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719
Benjamin Petersonbac79492012-01-14 13:34:47 -05006720 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006721 return NULL;
6722 size = PyUnicode_GET_LENGTH(unicode);
6723 kind = PyUnicode_KIND(unicode);
6724 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 /* allocate enough for a simple encoding without
6726 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006727 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006728 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006729
6730 _PyBytesWriter_Init(&writer);
6731 str = _PyBytesWriter_Alloc(&writer, size);
6732 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006733 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006736 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006739 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006741 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006745 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006747 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006748 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006750
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006751 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006753
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006754 /* Only overallocate the buffer if it's not the last write */
6755 writer.overallocate = (collend < size);
6756
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006758 if (error_handler == _Py_ERROR_UNKNOWN)
6759 error_handler = get_error_handler(errors);
6760
6761 switch (error_handler) {
6762 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006763 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006765
6766 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006767 memset(str, '?', collend - collstart);
6768 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006769 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006770 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 break;
Victor Stinner50149202015-09-22 00:26:54 +02006773
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006774 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006775 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006776 writer.min_size -= (collend - collstart);
6777 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006778 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006779 if (str == NULL)
6780 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006781 pos = collend;
6782 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006783
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006784 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006785 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006786 writer.min_size -= (collend - collstart);
6787 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006788 unicode, collstart, collend);
6789 if (str == NULL)
6790 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006791 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 break;
Victor Stinner50149202015-09-22 00:26:54 +02006793
Victor Stinnerc3713e92015-09-29 12:32:13 +02006794 case _Py_ERROR_SURROGATEESCAPE:
6795 for (i = collstart; i < collend; ++i) {
6796 ch = PyUnicode_READ(kind, data, i);
6797 if (ch < 0xdc80 || 0xdcff < ch) {
6798 /* Not a UTF-8b surrogate */
6799 break;
6800 }
6801 *str++ = (char)(ch - 0xdc00);
6802 ++pos;
6803 }
6804 if (i >= collend)
6805 break;
6806 collstart = pos;
6807 assert(collstart != collend);
6808 /* fallback to general error handling */
6809
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006811 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6812 encoding, reason, unicode, &exc,
6813 collstart, collend, &newpos);
6814 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006816
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006817 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006818 writer.min_size -= 1;
6819
Victor Stinner6bd525b2015-10-09 13:10:05 +02006820 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006821 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006822 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006823 PyBytes_AS_STRING(rep),
6824 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006825 if (str == NULL)
6826 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006827 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 else {
6829 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006830
Victor Stinner6bd525b2015-10-09 13:10:05 +02006831 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833
6834 if (PyUnicode_IS_ASCII(rep)) {
6835 /* Fast path: all characters are smaller than limit */
6836 assert(limit >= 128);
6837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6838 str = _PyBytesWriter_WriteBytes(&writer, str,
6839 PyUnicode_DATA(rep),
6840 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 else {
6843 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6844
6845 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6846 if (str == NULL)
6847 goto onError;
6848
6849 /* check if there is anything unencodable in the
6850 replacement and copy it to the output */
6851 for (i = 0; repsize-->0; ++i, ++str) {
6852 ch = PyUnicode_READ_CHAR(rep, i);
6853 if (ch >= limit) {
6854 raise_encode_exception(&exc, encoding, unicode,
6855 pos, pos+1, reason);
6856 goto onError;
6857 }
6858 *str = (char)ch;
6859 }
6860 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006862 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006863 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006864 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006865
6866 /* If overallocation was disabled, ensure that it was the last
6867 write. Otherwise, we missed an optimization */
6868 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006869 }
6870 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006871
Victor Stinner50149202015-09-22 00:26:54 +02006872 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006874 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006875
6876 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006877 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006878 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006879 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006880 Py_XDECREF(exc);
6881 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006882}
6883
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006884/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006885PyObject *
6886PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006887 Py_ssize_t size,
6888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 PyObject *result;
6891 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6892 if (unicode == NULL)
6893 return NULL;
6894 result = unicode_encode_ucs1(unicode, errors, 256);
6895 Py_DECREF(unicode);
6896 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
Alexander Belopolsky40018472011-02-26 01:02:56 +00006899PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006900_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901{
6902 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 PyErr_BadArgument();
6904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006906 if (PyUnicode_READY(unicode) == -1)
6907 return NULL;
6908 /* Fast path: if it is a one-byte string, construct
6909 bytes object directly. */
6910 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6911 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6912 PyUnicode_GET_LENGTH(unicode));
6913 /* Non-Latin-1 characters present. Defer to above function to
6914 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006915 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006916}
6917
6918PyObject*
6919PyUnicode_AsLatin1String(PyObject *unicode)
6920{
6921 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922}
6923
6924/* --- 7-bit ASCII Codec -------------------------------------------------- */
6925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_DecodeASCII(const char *s,
6928 Py_ssize_t size,
6929 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006932 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006933 int kind;
6934 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006935 Py_ssize_t startinpos;
6936 Py_ssize_t endinpos;
6937 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006941 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006944 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006947 if (size == 1 && (unsigned char)s[0] < 128)
6948 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006949
Victor Stinner8f674cc2013-04-17 23:02:17 +02006950 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006951 writer.min_length = size;
6952 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006953 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006956 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006957 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006958 writer.pos = outpos;
6959 if (writer.pos == size)
6960 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006961
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 s += writer.pos;
6963 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006965 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006967 PyUnicode_WRITE(kind, data, writer.pos, c);
6968 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006972
6973 /* byte outsize range 0x00..0x7f: call the error handler */
6974
6975 if (error_handler == _Py_ERROR_UNKNOWN)
6976 error_handler = get_error_handler(errors);
6977
6978 switch (error_handler)
6979 {
6980 case _Py_ERROR_REPLACE:
6981 case _Py_ERROR_SURROGATEESCAPE:
6982 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006983 but we may switch to UCS2 at the first write */
6984 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6985 goto onError;
6986 kind = writer.kind;
6987 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006988
6989 if (error_handler == _Py_ERROR_REPLACE)
6990 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6991 else
6992 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6993 writer.pos++;
6994 ++s;
6995 break;
6996
6997 case _Py_ERROR_IGNORE:
6998 ++s;
6999 break;
7000
7001 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 startinpos = s-starts;
7003 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007005 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 "ascii", "ordinal not in range(128)",
7007 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007010 kind = writer.kind;
7011 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007014 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007017
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007020 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 return NULL;
7023}
7024
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007025/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007026PyObject *
7027PyUnicode_EncodeASCII(const Py_UNICODE *p,
7028 Py_ssize_t size,
7029 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031 PyObject *result;
7032 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7033 if (unicode == NULL)
7034 return NULL;
7035 result = unicode_encode_ucs1(unicode, errors, 128);
7036 Py_DECREF(unicode);
7037 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042{
7043 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 PyErr_BadArgument();
7045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007047 if (PyUnicode_READY(unicode) == -1)
7048 return NULL;
7049 /* Fast path: if it is an ASCII-only string, construct bytes object
7050 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007051 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007052 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7053 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007054 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055}
7056
7057PyObject *
7058PyUnicode_AsASCIIString(PyObject *unicode)
7059{
7060 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061}
7062
Steve Dowercc16be82016-09-08 10:35:16 -07007063#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007064
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007065/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007066
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007067#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068#define NEED_RETRY
7069#endif
7070
Victor Stinner3a50e702011-10-18 21:21:00 +02007071#ifndef WC_ERR_INVALID_CHARS
7072# define WC_ERR_INVALID_CHARS 0x0080
7073#endif
7074
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007075static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007076code_page_name(UINT code_page, PyObject **obj)
7077{
7078 *obj = NULL;
7079 if (code_page == CP_ACP)
7080 return "mbcs";
7081 if (code_page == CP_UTF7)
7082 return "CP_UTF7";
7083 if (code_page == CP_UTF8)
7084 return "CP_UTF8";
7085
7086 *obj = PyBytes_FromFormat("cp%u", code_page);
7087 if (*obj == NULL)
7088 return NULL;
7089 return PyBytes_AS_STRING(*obj);
7090}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091
Victor Stinner3a50e702011-10-18 21:21:00 +02007092static DWORD
7093decode_code_page_flags(UINT code_page)
7094{
7095 if (code_page == CP_UTF7) {
7096 /* The CP_UTF7 decoder only supports flags=0 */
7097 return 0;
7098 }
7099 else
7100 return MB_ERR_INVALID_CHARS;
7101}
7102
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 * Decode a byte string from a Windows code page into unicode object in strict
7105 * mode.
7106 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007107 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7108 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007111decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007112 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 const char *in,
7114 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115{
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007117 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119
7120 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 assert(insize > 0);
7122 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7123 if (outsize <= 0)
7124 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125
7126 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007128 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007129 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 if (*v == NULL)
7131 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133 }
7134 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007137 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140 }
7141
7142 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7144 if (outsize <= 0)
7145 goto error;
7146 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007147
Victor Stinner3a50e702011-10-18 21:21:00 +02007148error:
7149 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7150 return -2;
7151 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153}
7154
Victor Stinner3a50e702011-10-18 21:21:00 +02007155/*
7156 * Decode a byte string from a code page into unicode object with an error
7157 * handler.
7158 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007159 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 * UnicodeDecodeError exception and returns -1 on error.
7161 */
7162static int
7163decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007164 PyObject **v,
7165 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007166 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007167{
7168 const char *startin = in;
7169 const char *endin = in + size;
7170 const DWORD flags = decode_code_page_flags(code_page);
7171 /* Ideally, we should get reason from FormatMessage. This is the Windows
7172 2000 English version of the message. */
7173 const char *reason = "No mapping for the Unicode character exists "
7174 "in the target code page.";
7175 /* each step cannot decode more than 1 character, but a character can be
7176 represented as a surrogate pair */
7177 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007178 int insize;
7179 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 PyObject *errorHandler = NULL;
7181 PyObject *exc = NULL;
7182 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007183 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 DWORD err;
7185 int ret = -1;
7186
7187 assert(size > 0);
7188
7189 encoding = code_page_name(code_page, &encoding_obj);
7190 if (encoding == NULL)
7191 return -1;
7192
Victor Stinner7d00cc12014-03-17 23:08:06 +01007193 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7195 UnicodeDecodeError. */
7196 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7197 if (exc != NULL) {
7198 PyCodec_StrictErrors(exc);
7199 Py_CLEAR(exc);
7200 }
7201 goto error;
7202 }
7203
7204 if (*v == NULL) {
7205 /* Create unicode object */
7206 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7207 PyErr_NoMemory();
7208 goto error;
7209 }
Victor Stinnerab595942011-12-17 04:59:06 +01007210 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 if (*v == NULL)
7213 goto error;
7214 startout = PyUnicode_AS_UNICODE(*v);
7215 }
7216 else {
7217 /* Extend unicode object */
7218 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7219 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7220 PyErr_NoMemory();
7221 goto error;
7222 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007223 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 goto error;
7225 startout = PyUnicode_AS_UNICODE(*v) + n;
7226 }
7227
7228 /* Decode the byte string character per character */
7229 out = startout;
7230 while (in < endin)
7231 {
7232 /* Decode a character */
7233 insize = 1;
7234 do
7235 {
7236 outsize = MultiByteToWideChar(code_page, flags,
7237 in, insize,
7238 buffer, Py_ARRAY_LENGTH(buffer));
7239 if (outsize > 0)
7240 break;
7241 err = GetLastError();
7242 if (err != ERROR_NO_UNICODE_TRANSLATION
7243 && err != ERROR_INSUFFICIENT_BUFFER)
7244 {
7245 PyErr_SetFromWindowsErr(0);
7246 goto error;
7247 }
7248 insize++;
7249 }
7250 /* 4=maximum length of a UTF-8 sequence */
7251 while (insize <= 4 && (in + insize) <= endin);
7252
7253 if (outsize <= 0) {
7254 Py_ssize_t startinpos, endinpos, outpos;
7255
Victor Stinner7d00cc12014-03-17 23:08:06 +01007256 /* last character in partial decode? */
7257 if (in + insize >= endin && !final)
7258 break;
7259
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 startinpos = in - startin;
7261 endinpos = startinpos + 1;
7262 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007263 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 errors, &errorHandler,
7265 encoding, reason,
7266 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007267 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 {
7269 goto error;
7270 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007271 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 }
7273 else {
7274 in += insize;
7275 memcpy(out, buffer, outsize * sizeof(wchar_t));
7276 out += outsize;
7277 }
7278 }
7279
7280 /* write a NUL character at the end */
7281 *out = 0;
7282
7283 /* Extend unicode object */
7284 outsize = out - startout;
7285 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007286 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007288 /* (in - startin) <= size and size is an int */
7289 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007290
7291error:
7292 Py_XDECREF(encoding_obj);
7293 Py_XDECREF(errorHandler);
7294 Py_XDECREF(exc);
7295 return ret;
7296}
7297
Victor Stinner3a50e702011-10-18 21:21:00 +02007298static PyObject *
7299decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 const char *s, Py_ssize_t size,
7301 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302{
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 PyObject *v = NULL;
7304 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 if (code_page < 0) {
7307 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7308 return NULL;
7309 }
7310
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313
Victor Stinner76a31a62011-11-04 00:05:13 +01007314 do
7315 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 if (size > INT_MAX) {
7318 chunk_size = INT_MAX;
7319 final = 0;
7320 done = 0;
7321 }
7322 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 {
7325 chunk_size = (int)size;
7326 final = (consumed == NULL);
7327 done = 1;
7328 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329
Victor Stinner76a31a62011-11-04 00:05:13 +01007330 if (chunk_size == 0 && done) {
7331 if (v != NULL)
7332 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007333 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007335
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 converted = decode_code_page_strict(code_page, &v,
7337 s, chunk_size);
7338 if (converted == -2)
7339 converted = decode_code_page_errors(code_page, &v,
7340 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007341 errors, final);
7342 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007343
7344 if (converted < 0) {
7345 Py_XDECREF(v);
7346 return NULL;
7347 }
7348
7349 if (consumed)
7350 *consumed += converted;
7351
7352 s += converted;
7353 size -= converted;
7354 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007355
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007356 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357}
7358
Alexander Belopolsky40018472011-02-26 01:02:56 +00007359PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007360PyUnicode_DecodeCodePageStateful(int code_page,
7361 const char *s,
7362 Py_ssize_t size,
7363 const char *errors,
7364 Py_ssize_t *consumed)
7365{
7366 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7367}
7368
7369PyObject *
7370PyUnicode_DecodeMBCSStateful(const char *s,
7371 Py_ssize_t size,
7372 const char *errors,
7373 Py_ssize_t *consumed)
7374{
7375 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7376}
7377
7378PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379PyUnicode_DecodeMBCS(const char *s,
7380 Py_ssize_t size,
7381 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007382{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007383 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7384}
7385
Victor Stinner3a50e702011-10-18 21:21:00 +02007386static DWORD
7387encode_code_page_flags(UINT code_page, const char *errors)
7388{
7389 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007390 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 }
7392 else if (code_page == CP_UTF7) {
7393 /* CP_UTF7 only supports flags=0 */
7394 return 0;
7395 }
7396 else {
7397 if (errors != NULL && strcmp(errors, "replace") == 0)
7398 return 0;
7399 else
7400 return WC_NO_BEST_FIT_CHARS;
7401 }
7402}
7403
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007404/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 * Encode a Unicode string to a Windows code page into a byte string in strict
7406 * mode.
7407 *
7408 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007409 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007410 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007411static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007412encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007415{
Victor Stinner554f3f02010-06-16 23:33:54 +00007416 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 BOOL *pusedDefaultChar = &usedDefaultChar;
7418 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007419 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 const DWORD flags = encode_code_page_flags(code_page, NULL);
7422 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 /* Create a substring so that we can get the UTF-16 representation
7424 of just the slice under consideration. */
7425 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007426
Martin v. Löwis3d325192011-11-04 18:23:06 +01007427 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007428
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007430 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007432 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007433
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 substring = PyUnicode_Substring(unicode, offset, offset+len);
7435 if (substring == NULL)
7436 return -1;
7437 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7438 if (p == NULL) {
7439 Py_DECREF(substring);
7440 return -1;
7441 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007442 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007443
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007444 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007446 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 NULL, 0,
7448 NULL, pusedDefaultChar);
7449 if (outsize <= 0)
7450 goto error;
7451 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 if (pusedDefaultChar && *pusedDefaultChar) {
7453 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007456
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 if (*outbytes == NULL) {
7461 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465 }
7466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 const Py_ssize_t n = PyBytes_Size(*outbytes);
7469 if (outsize > PY_SSIZE_T_MAX - n) {
7470 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7475 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479 }
7480
7481 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007483 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 out, outsize,
7485 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 if (outsize <= 0)
7488 goto error;
7489 if (pusedDefaultChar && *pusedDefaultChar)
7490 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007492
Victor Stinner3a50e702011-10-18 21:21:00 +02007493error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7496 return -2;
7497 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007498 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007499}
7500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007502 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 * error handler.
7504 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007505 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 * -1 on other error.
7507 */
7508static int
7509encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007510 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007511 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007512{
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007514 Py_ssize_t pos = unicode_offset;
7515 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 /* Ideally, we should get reason from FormatMessage. This is the Windows
7517 2000 English version of the message. */
7518 const char *reason = "invalid character";
7519 /* 4=maximum length of a UTF-8 sequence */
7520 char buffer[4];
7521 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7522 Py_ssize_t outsize;
7523 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 PyObject *errorHandler = NULL;
7525 PyObject *exc = NULL;
7526 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007527 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007528 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 PyObject *rep;
7530 int ret = -1;
7531
7532 assert(insize > 0);
7533
7534 encoding = code_page_name(code_page, &encoding_obj);
7535 if (encoding == NULL)
7536 return -1;
7537
7538 if (errors == NULL || strcmp(errors, "strict") == 0) {
7539 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7540 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007541 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 if (exc != NULL) {
7543 PyCodec_StrictErrors(exc);
7544 Py_DECREF(exc);
7545 }
7546 Py_XDECREF(encoding_obj);
7547 return -1;
7548 }
7549
7550 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7551 pusedDefaultChar = &usedDefaultChar;
7552 else
7553 pusedDefaultChar = NULL;
7554
7555 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7556 PyErr_NoMemory();
7557 goto error;
7558 }
7559 outsize = insize * Py_ARRAY_LENGTH(buffer);
7560
7561 if (*outbytes == NULL) {
7562 /* Create string object */
7563 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7564 if (*outbytes == NULL)
7565 goto error;
7566 out = PyBytes_AS_STRING(*outbytes);
7567 }
7568 else {
7569 /* Extend string object */
7570 Py_ssize_t n = PyBytes_Size(*outbytes);
7571 if (n > PY_SSIZE_T_MAX - outsize) {
7572 PyErr_NoMemory();
7573 goto error;
7574 }
7575 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7576 goto error;
7577 out = PyBytes_AS_STRING(*outbytes) + n;
7578 }
7579
7580 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007581 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007583 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7584 wchar_t chars[2];
7585 int charsize;
7586 if (ch < 0x10000) {
7587 chars[0] = (wchar_t)ch;
7588 charsize = 1;
7589 }
7590 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007591 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7592 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007593 charsize = 2;
7594 }
7595
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007597 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 buffer, Py_ARRAY_LENGTH(buffer),
7599 NULL, pusedDefaultChar);
7600 if (outsize > 0) {
7601 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7602 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007603 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 memcpy(out, buffer, outsize);
7605 out += outsize;
7606 continue;
7607 }
7608 }
7609 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7610 PyErr_SetFromWindowsErr(0);
7611 goto error;
7612 }
7613
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 rep = unicode_encode_call_errorhandler(
7615 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007616 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007617 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 if (rep == NULL)
7619 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007621
7622 if (PyBytes_Check(rep)) {
7623 outsize = PyBytes_GET_SIZE(rep);
7624 if (outsize != 1) {
7625 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7626 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7627 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7628 Py_DECREF(rep);
7629 goto error;
7630 }
7631 out = PyBytes_AS_STRING(*outbytes) + offset;
7632 }
7633 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7634 out += outsize;
7635 }
7636 else {
7637 Py_ssize_t i;
7638 enum PyUnicode_Kind kind;
7639 void *data;
7640
Benjamin Petersonbac79492012-01-14 13:34:47 -05007641 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 Py_DECREF(rep);
7643 goto error;
7644 }
7645
7646 outsize = PyUnicode_GET_LENGTH(rep);
7647 if (outsize != 1) {
7648 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7649 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7650 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7651 Py_DECREF(rep);
7652 goto error;
7653 }
7654 out = PyBytes_AS_STRING(*outbytes) + offset;
7655 }
7656 kind = PyUnicode_KIND(rep);
7657 data = PyUnicode_DATA(rep);
7658 for (i=0; i < outsize; i++) {
7659 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7660 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007661 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007662 encoding, unicode,
7663 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 "unable to encode error handler result to ASCII");
7665 Py_DECREF(rep);
7666 goto error;
7667 }
7668 *out = (unsigned char)ch;
7669 out++;
7670 }
7671 }
7672 Py_DECREF(rep);
7673 }
7674 /* write a NUL byte */
7675 *out = 0;
7676 outsize = out - PyBytes_AS_STRING(*outbytes);
7677 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7678 if (_PyBytes_Resize(outbytes, outsize) < 0)
7679 goto error;
7680 ret = 0;
7681
7682error:
7683 Py_XDECREF(encoding_obj);
7684 Py_XDECREF(errorHandler);
7685 Py_XDECREF(exc);
7686 return ret;
7687}
7688
Victor Stinner3a50e702011-10-18 21:21:00 +02007689static PyObject *
7690encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007691 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 const char *errors)
7693{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007694 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007696 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007697 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007698
Victor Stinner29dacf22015-01-26 16:41:32 +01007699 if (!PyUnicode_Check(unicode)) {
7700 PyErr_BadArgument();
7701 return NULL;
7702 }
7703
Benjamin Petersonbac79492012-01-14 13:34:47 -05007704 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007705 return NULL;
7706 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007707
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 if (code_page < 0) {
7709 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7710 return NULL;
7711 }
7712
Martin v. Löwis3d325192011-11-04 18:23:06 +01007713 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 return PyBytes_FromStringAndSize(NULL, 0);
7715
Victor Stinner7581cef2011-11-03 22:32:33 +01007716 offset = 0;
7717 do
7718 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007719#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007720 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 chunks. */
7722 if (len > INT_MAX/2) {
7723 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007724 done = 0;
7725 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007726 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007727#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007728 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007729 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007730 done = 1;
7731 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007732
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 errors);
7736 if (ret == -2)
7737 ret = encode_code_page_errors(code_page, &outbytes,
7738 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007739 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007740 if (ret < 0) {
7741 Py_XDECREF(outbytes);
7742 return NULL;
7743 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007744
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007747 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007748
Victor Stinner3a50e702011-10-18 21:21:00 +02007749 return outbytes;
7750}
7751
7752PyObject *
7753PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7754 Py_ssize_t size,
7755 const char *errors)
7756{
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 PyObject *unicode, *res;
7758 unicode = PyUnicode_FromUnicode(p, size);
7759 if (unicode == NULL)
7760 return NULL;
7761 res = encode_code_page(CP_ACP, unicode, errors);
7762 Py_DECREF(unicode);
7763 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007764}
7765
7766PyObject *
7767PyUnicode_EncodeCodePage(int code_page,
7768 PyObject *unicode,
7769 const char *errors)
7770{
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007772}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007773
Alexander Belopolsky40018472011-02-26 01:02:56 +00007774PyObject *
7775PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007776{
Victor Stinner7581cef2011-11-03 22:32:33 +01007777 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007778}
7779
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007780#undef NEED_RETRY
7781
Steve Dowercc16be82016-09-08 10:35:16 -07007782#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784/* --- Character Mapping Codec -------------------------------------------- */
7785
Victor Stinnerfb161b12013-04-18 01:44:27 +02007786static int
7787charmap_decode_string(const char *s,
7788 Py_ssize_t size,
7789 PyObject *mapping,
7790 const char *errors,
7791 _PyUnicodeWriter *writer)
7792{
7793 const char *starts = s;
7794 const char *e;
7795 Py_ssize_t startinpos, endinpos;
7796 PyObject *errorHandler = NULL, *exc = NULL;
7797 Py_ssize_t maplen;
7798 enum PyUnicode_Kind mapkind;
7799 void *mapdata;
7800 Py_UCS4 x;
7801 unsigned char ch;
7802
7803 if (PyUnicode_READY(mapping) == -1)
7804 return -1;
7805
7806 maplen = PyUnicode_GET_LENGTH(mapping);
7807 mapdata = PyUnicode_DATA(mapping);
7808 mapkind = PyUnicode_KIND(mapping);
7809
7810 e = s + size;
7811
7812 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7813 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7814 * is disabled in encoding aliases, latin1 is preferred because
7815 * its implementation is faster. */
7816 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7817 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7818 Py_UCS4 maxchar = writer->maxchar;
7819
7820 assert (writer->kind == PyUnicode_1BYTE_KIND);
7821 while (s < e) {
7822 ch = *s;
7823 x = mapdata_ucs1[ch];
7824 if (x > maxchar) {
7825 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7826 goto onError;
7827 maxchar = writer->maxchar;
7828 outdata = (Py_UCS1 *)writer->data;
7829 }
7830 outdata[writer->pos] = x;
7831 writer->pos++;
7832 ++s;
7833 }
7834 return 0;
7835 }
7836
7837 while (s < e) {
7838 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7839 enum PyUnicode_Kind outkind = writer->kind;
7840 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7841 if (outkind == PyUnicode_1BYTE_KIND) {
7842 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7843 Py_UCS4 maxchar = writer->maxchar;
7844 while (s < e) {
7845 ch = *s;
7846 x = mapdata_ucs2[ch];
7847 if (x > maxchar)
7848 goto Error;
7849 outdata[writer->pos] = x;
7850 writer->pos++;
7851 ++s;
7852 }
7853 break;
7854 }
7855 else if (outkind == PyUnicode_2BYTE_KIND) {
7856 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7857 while (s < e) {
7858 ch = *s;
7859 x = mapdata_ucs2[ch];
7860 if (x == 0xFFFE)
7861 goto Error;
7862 outdata[writer->pos] = x;
7863 writer->pos++;
7864 ++s;
7865 }
7866 break;
7867 }
7868 }
7869 ch = *s;
7870
7871 if (ch < maplen)
7872 x = PyUnicode_READ(mapkind, mapdata, ch);
7873 else
7874 x = 0xfffe; /* invalid value */
7875Error:
7876 if (x == 0xfffe)
7877 {
7878 /* undefined mapping */
7879 startinpos = s-starts;
7880 endinpos = startinpos+1;
7881 if (unicode_decode_call_errorhandler_writer(
7882 errors, &errorHandler,
7883 "charmap", "character maps to <undefined>",
7884 &starts, &e, &startinpos, &endinpos, &exc, &s,
7885 writer)) {
7886 goto onError;
7887 }
7888 continue;
7889 }
7890
7891 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7892 goto onError;
7893 ++s;
7894 }
7895 Py_XDECREF(errorHandler);
7896 Py_XDECREF(exc);
7897 return 0;
7898
7899onError:
7900 Py_XDECREF(errorHandler);
7901 Py_XDECREF(exc);
7902 return -1;
7903}
7904
7905static int
7906charmap_decode_mapping(const char *s,
7907 Py_ssize_t size,
7908 PyObject *mapping,
7909 const char *errors,
7910 _PyUnicodeWriter *writer)
7911{
7912 const char *starts = s;
7913 const char *e;
7914 Py_ssize_t startinpos, endinpos;
7915 PyObject *errorHandler = NULL, *exc = NULL;
7916 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007917 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007918
7919 e = s + size;
7920
7921 while (s < e) {
7922 ch = *s;
7923
7924 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7925 key = PyLong_FromLong((long)ch);
7926 if (key == NULL)
7927 goto onError;
7928
7929 item = PyObject_GetItem(mapping, key);
7930 Py_DECREF(key);
7931 if (item == NULL) {
7932 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7933 /* No mapping found means: mapping is undefined. */
7934 PyErr_Clear();
7935 goto Undefined;
7936 } else
7937 goto onError;
7938 }
7939
7940 /* Apply mapping */
7941 if (item == Py_None)
7942 goto Undefined;
7943 if (PyLong_Check(item)) {
7944 long value = PyLong_AS_LONG(item);
7945 if (value == 0xFFFE)
7946 goto Undefined;
7947 if (value < 0 || value > MAX_UNICODE) {
7948 PyErr_Format(PyExc_TypeError,
7949 "character mapping must be in range(0x%lx)",
7950 (unsigned long)MAX_UNICODE + 1);
7951 goto onError;
7952 }
7953
7954 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7955 goto onError;
7956 }
7957 else if (PyUnicode_Check(item)) {
7958 if (PyUnicode_READY(item) == -1)
7959 goto onError;
7960 if (PyUnicode_GET_LENGTH(item) == 1) {
7961 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7962 if (value == 0xFFFE)
7963 goto Undefined;
7964 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7965 goto onError;
7966 }
7967 else {
7968 writer->overallocate = 1;
7969 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7970 goto onError;
7971 }
7972 }
7973 else {
7974 /* wrong return value */
7975 PyErr_SetString(PyExc_TypeError,
7976 "character mapping must return integer, None or str");
7977 goto onError;
7978 }
7979 Py_CLEAR(item);
7980 ++s;
7981 continue;
7982
7983Undefined:
7984 /* undefined mapping */
7985 Py_CLEAR(item);
7986 startinpos = s-starts;
7987 endinpos = startinpos+1;
7988 if (unicode_decode_call_errorhandler_writer(
7989 errors, &errorHandler,
7990 "charmap", "character maps to <undefined>",
7991 &starts, &e, &startinpos, &endinpos, &exc, &s,
7992 writer)) {
7993 goto onError;
7994 }
7995 }
7996 Py_XDECREF(errorHandler);
7997 Py_XDECREF(exc);
7998 return 0;
7999
8000onError:
8001 Py_XDECREF(item);
8002 Py_XDECREF(errorHandler);
8003 Py_XDECREF(exc);
8004 return -1;
8005}
8006
Alexander Belopolsky40018472011-02-26 01:02:56 +00008007PyObject *
8008PyUnicode_DecodeCharmap(const char *s,
8009 Py_ssize_t size,
8010 PyObject *mapping,
8011 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008013 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 /* Default to Latin-1 */
8016 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008020 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008021 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008022 writer.min_length = size;
8023 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008025
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008026 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008027 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8028 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008029 }
8030 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008031 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008034 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008035
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 return NULL;
8039}
8040
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041/* Charmap encoding: the lookup table */
8042
Alexander Belopolsky40018472011-02-26 01:02:56 +00008043struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 PyObject_HEAD
8045 unsigned char level1[32];
8046 int count2, count3;
8047 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048};
8049
8050static PyObject*
8051encoding_map_size(PyObject *obj, PyObject* args)
8052{
8053 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008054 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056}
8057
8058static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 PyDoc_STR("Return the size (in bytes) of this object") },
8061 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062};
8063
8064static void
8065encoding_map_dealloc(PyObject* o)
8066{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068}
8069
8070static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 "EncodingMap", /*tp_name*/
8073 sizeof(struct encoding_map), /*tp_basicsize*/
8074 0, /*tp_itemsize*/
8075 /* methods */
8076 encoding_map_dealloc, /*tp_dealloc*/
8077 0, /*tp_print*/
8078 0, /*tp_getattr*/
8079 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008080 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 0, /*tp_repr*/
8082 0, /*tp_as_number*/
8083 0, /*tp_as_sequence*/
8084 0, /*tp_as_mapping*/
8085 0, /*tp_hash*/
8086 0, /*tp_call*/
8087 0, /*tp_str*/
8088 0, /*tp_getattro*/
8089 0, /*tp_setattro*/
8090 0, /*tp_as_buffer*/
8091 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8092 0, /*tp_doc*/
8093 0, /*tp_traverse*/
8094 0, /*tp_clear*/
8095 0, /*tp_richcompare*/
8096 0, /*tp_weaklistoffset*/
8097 0, /*tp_iter*/
8098 0, /*tp_iternext*/
8099 encoding_map_methods, /*tp_methods*/
8100 0, /*tp_members*/
8101 0, /*tp_getset*/
8102 0, /*tp_base*/
8103 0, /*tp_dict*/
8104 0, /*tp_descr_get*/
8105 0, /*tp_descr_set*/
8106 0, /*tp_dictoffset*/
8107 0, /*tp_init*/
8108 0, /*tp_alloc*/
8109 0, /*tp_new*/
8110 0, /*tp_free*/
8111 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008112};
8113
8114PyObject*
8115PyUnicode_BuildEncodingMap(PyObject* string)
8116{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 PyObject *result;
8118 struct encoding_map *mresult;
8119 int i;
8120 int need_dict = 0;
8121 unsigned char level1[32];
8122 unsigned char level2[512];
8123 unsigned char *mlevel1, *mlevel2, *mlevel3;
8124 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125 int kind;
8126 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008127 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008130 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 PyErr_BadArgument();
8132 return NULL;
8133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 kind = PyUnicode_KIND(string);
8135 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008136 length = PyUnicode_GET_LENGTH(string);
8137 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 memset(level1, 0xFF, sizeof level1);
8139 memset(level2, 0xFF, sizeof level2);
8140
8141 /* If there isn't a one-to-one mapping of NULL to \0,
8142 or if there are non-BMP characters, we need to use
8143 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008146 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 ch = PyUnicode_READ(kind, data, i);
8149 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 need_dict = 1;
8151 break;
8152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 /* unmapped character */
8155 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 l1 = ch >> 11;
8157 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 if (level1[l1] == 0xFF)
8159 level1[l1] = count2++;
8160 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 }
8163
8164 if (count2 >= 0xFF || count3 >= 0xFF)
8165 need_dict = 1;
8166
8167 if (need_dict) {
8168 PyObject *result = PyDict_New();
8169 PyObject *key, *value;
8170 if (!result)
8171 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008172 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008174 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008175 if (!key || !value)
8176 goto failed1;
8177 if (PyDict_SetItem(result, key, value) == -1)
8178 goto failed1;
8179 Py_DECREF(key);
8180 Py_DECREF(value);
8181 }
8182 return result;
8183 failed1:
8184 Py_XDECREF(key);
8185 Py_XDECREF(value);
8186 Py_DECREF(result);
8187 return NULL;
8188 }
8189
8190 /* Create a three-level trie */
8191 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8192 16*count2 + 128*count3 - 1);
8193 if (!result)
8194 return PyErr_NoMemory();
8195 PyObject_Init(result, &EncodingMapType);
8196 mresult = (struct encoding_map*)result;
8197 mresult->count2 = count2;
8198 mresult->count3 = count3;
8199 mlevel1 = mresult->level1;
8200 mlevel2 = mresult->level23;
8201 mlevel3 = mresult->level23 + 16*count2;
8202 memcpy(mlevel1, level1, 32);
8203 memset(mlevel2, 0xFF, 16*count2);
8204 memset(mlevel3, 0, 128*count3);
8205 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008206 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008207 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008208 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8209 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 /* unmapped character */
8211 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008212 o1 = ch>>11;
8213 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 i2 = 16*mlevel1[o1] + o2;
8215 if (mlevel2[i2] == 0xFF)
8216 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008217 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 i3 = 128*mlevel2[i2] + o3;
8219 mlevel3[i3] = i;
8220 }
8221 return result;
8222}
8223
8224static int
Victor Stinner22168992011-11-20 17:09:18 +01008225encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226{
8227 struct encoding_map *map = (struct encoding_map*)mapping;
8228 int l1 = c>>11;
8229 int l2 = (c>>7) & 0xF;
8230 int l3 = c & 0x7F;
8231 int i;
8232
Victor Stinner22168992011-11-20 17:09:18 +01008233 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008235 if (c == 0)
8236 return 0;
8237 /* level 1*/
8238 i = map->level1[l1];
8239 if (i == 0xFF) {
8240 return -1;
8241 }
8242 /* level 2*/
8243 i = map->level23[16*i+l2];
8244 if (i == 0xFF) {
8245 return -1;
8246 }
8247 /* level 3 */
8248 i = map->level23[16*map->count2 + 128*i + l3];
8249 if (i == 0) {
8250 return -1;
8251 }
8252 return i;
8253}
8254
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255/* Lookup the character ch in the mapping. If the character
8256 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008257 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008258static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008259charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260{
Christian Heimes217cfd12007-12-02 14:31:20 +00008261 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 PyObject *x;
8263
8264 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 x = PyObject_GetItem(mapping, w);
8267 Py_DECREF(w);
8268 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8270 /* No mapping found means: mapping is undefined. */
8271 PyErr_Clear();
8272 x = Py_None;
8273 Py_INCREF(x);
8274 return x;
8275 } else
8276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008278 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008280 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 long value = PyLong_AS_LONG(x);
8282 if (value < 0 || value > 255) {
8283 PyErr_SetString(PyExc_TypeError,
8284 "character mapping must be in range(256)");
8285 Py_DECREF(x);
8286 return NULL;
8287 }
8288 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 /* wrong return value */
8294 PyErr_Format(PyExc_TypeError,
8295 "character mapping must return integer, bytes or None, not %.400s",
8296 x->ob_type->tp_name);
8297 Py_DECREF(x);
8298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
8300}
8301
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008303charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008305 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8306 /* exponentially overallocate to minimize reallocations */
8307 if (requiredsize < 2*outsize)
8308 requiredsize = 2*outsize;
8309 if (_PyBytes_Resize(outobj, requiredsize))
8310 return -1;
8311 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312}
8313
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the buffer resize raised an error */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008318 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 space is available. Return a new reference to the object that
8320 was put in the output buffer, or Py_None, if the mapping was undefined
8321 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008322 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008324charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008327 PyObject *rep;
8328 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008329 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330
Christian Heimes90aa7642007-12-19 02:45:37 +00008331 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 if (res == -1)
8335 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (outsize<requiredsize)
8337 if (charmapencode_resize(outobj, outpos, requiredsize))
8338 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008339 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 outstart[(*outpos)++] = (char)res;
8341 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 }
8343
8344 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 Py_DECREF(rep);
8349 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 if (PyLong_Check(rep)) {
8352 Py_ssize_t requiredsize = *outpos+1;
8353 if (outsize<requiredsize)
8354 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8355 Py_DECREF(rep);
8356 return enc_EXCEPTION;
8357 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008358 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008360 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 else {
8362 const char *repchars = PyBytes_AS_STRING(rep);
8363 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8364 Py_ssize_t requiredsize = *outpos+repsize;
8365 if (outsize<requiredsize)
8366 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8367 Py_DECREF(rep);
8368 return enc_EXCEPTION;
8369 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008370 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 memcpy(outstart + *outpos, repchars, repsize);
8372 *outpos += repsize;
8373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 Py_DECREF(rep);
8376 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377}
8378
8379/* handle an error in PyUnicode_EncodeCharmap
8380 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381static int
8382charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008385 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008386 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387{
8388 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008390 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008391 enum PyUnicode_Kind kind;
8392 void *data;
8393 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t collstartpos = *inpos;
8396 Py_ssize_t collendpos = *inpos+1;
8397 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 char *encoding = "charmap";
8399 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008400 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008401 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008402 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403
Benjamin Petersonbac79492012-01-14 13:34:47 -05008404 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 return -1;
8406 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 /* find all unencodable characters */
8408 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008409 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008410 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008412 val = encoding_map_lookup(ch, mapping);
8413 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 break;
8415 ++collendpos;
8416 continue;
8417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8420 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 if (rep==NULL)
8422 return -1;
8423 else if (rep!=Py_None) {
8424 Py_DECREF(rep);
8425 break;
8426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430 /* cache callback name lookup
8431 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008432 if (*error_handler == _Py_ERROR_UNKNOWN)
8433 *error_handler = get_error_handler(errors);
8434
8435 switch (*error_handler) {
8436 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008437 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008439
8440 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 x = charmapencode_output('?', mapping, res, respos);
8443 if (x==enc_EXCEPTION) {
8444 return -1;
8445 }
8446 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008447 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return -1;
8449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 }
8451 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008452 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 *inpos = collendpos;
8454 break;
Victor Stinner50149202015-09-22 00:26:54 +02008455
8456 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 /* generate replacement (temporarily (mis)uses p) */
8458 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 char buffer[2+29+1+1];
8460 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008461 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 for (cp = buffer; *cp; ++cp) {
8463 x = charmapencode_output(*cp, mapping, res, respos);
8464 if (x==enc_EXCEPTION)
8465 return -1;
8466 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008467 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
8469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 }
8471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 *inpos = collendpos;
8473 break;
Victor Stinner50149202015-09-22 00:26:54 +02008474
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 default:
Victor Stinner50149202015-09-22 00:26:54 +02008476 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008477 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008481 if (PyBytes_Check(repunicode)) {
8482 /* Directly copy bytes result to output. */
8483 Py_ssize_t outsize = PyBytes_Size(*res);
8484 Py_ssize_t requiredsize;
8485 repsize = PyBytes_Size(repunicode);
8486 requiredsize = *respos + repsize;
8487 if (requiredsize > outsize)
8488 /* Make room for all additional bytes. */
8489 if (charmapencode_resize(res, respos, requiredsize)) {
8490 Py_DECREF(repunicode);
8491 return -1;
8492 }
8493 memcpy(PyBytes_AsString(*res) + *respos,
8494 PyBytes_AsString(repunicode), repsize);
8495 *respos += repsize;
8496 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008497 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008498 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008501 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008502 Py_DECREF(repunicode);
8503 return -1;
8504 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008505 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008506 data = PyUnicode_DATA(repunicode);
8507 kind = PyUnicode_KIND(repunicode);
8508 for (index = 0; index < repsize; index++) {
8509 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8510 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -1;
8514 }
8515 else if (x==enc_FAILED) {
8516 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008517 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
8519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520 }
8521 *inpos = newpos;
8522 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 }
8524 return 0;
8525}
8526
Alexander Belopolsky40018472011-02-26 01:02:56 +00008527PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528_PyUnicode_EncodeCharmap(PyObject *unicode,
8529 PyObject *mapping,
8530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 /* output object */
8533 PyObject *res = NULL;
8534 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008539 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008541 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008542 void *data;
8543 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Benjamin Petersonbac79492012-01-14 13:34:47 -05008545 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 return NULL;
8547 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008548 data = PyUnicode_DATA(unicode);
8549 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 /* Default to Latin-1 */
8552 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 /* allocate enough for a simple encoding without
8556 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008557 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 if (res == NULL)
8559 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008560 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008564 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (x==enc_EXCEPTION) /* error */
8568 goto onError;
8569 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008572 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 &res, &respos)) {
8574 goto onError;
8575 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 else
8578 /* done with this character => adjust input position */
8579 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008583 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008584 if (_PyBytes_Resize(&res, respos) < 0)
8585 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008588 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 return res;
8590
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 Py_XDECREF(res);
8593 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008594 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 return NULL;
8596}
8597
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008598/* Deprecated */
8599PyObject *
8600PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8601 Py_ssize_t size,
8602 PyObject *mapping,
8603 const char *errors)
8604{
8605 PyObject *result;
8606 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8607 if (unicode == NULL)
8608 return NULL;
8609 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8610 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008611 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612}
8613
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614PyObject *
8615PyUnicode_AsCharmapString(PyObject *unicode,
8616 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617{
8618 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 PyErr_BadArgument();
8620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008622 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623}
8624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626static void
8627make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629 Py_ssize_t startpos, Py_ssize_t endpos,
8630 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 *exceptionObject = _PyUnicodeTranslateError_Create(
8634 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8638 goto onError;
8639 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8640 goto onError;
8641 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8642 goto onError;
8643 return;
8644 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008645 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
8647}
8648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649/* error handling callback helper:
8650 build arguments, call the callback and check the arguments,
8651 put the result into newpos and return the replacement string, which
8652 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008653static PyObject *
8654unicode_translate_call_errorhandler(const char *errors,
8655 PyObject **errorHandler,
8656 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658 Py_ssize_t startpos, Py_ssize_t endpos,
8659 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008661 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008663 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 PyObject *restuple;
8665 PyObject *resunicode;
8666
8667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 }
8672
8673 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677
8678 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008683 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 Py_DECREF(restuple);
8685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 }
8687 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 &resunicode, &i_newpos)) {
8689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 else
8695 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008697 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 Py_DECREF(restuple);
8699 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 Py_INCREF(resunicode);
8702 Py_DECREF(restuple);
8703 return resunicode;
8704}
8705
8706/* Lookup the character ch in the mapping and put the result in result,
8707 which must be decrefed by the caller.
8708 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008709static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711{
Christian Heimes217cfd12007-12-02 14:31:20 +00008712 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 PyObject *x;
8714
8715 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 x = PyObject_GetItem(mapping, w);
8718 Py_DECREF(w);
8719 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8721 /* No mapping found means: use 1:1 mapping. */
8722 PyErr_Clear();
8723 *result = NULL;
8724 return 0;
8725 } else
8726 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
8728 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 *result = x;
8730 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008732 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008734 if (value < 0 || value > MAX_UNICODE) {
8735 PyErr_Format(PyExc_ValueError,
8736 "character mapping must be in range(0x%x)",
8737 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 Py_DECREF(x);
8739 return -1;
8740 }
8741 *result = x;
8742 return 0;
8743 }
8744 else if (PyUnicode_Check(x)) {
8745 *result = x;
8746 return 0;
8747 }
8748 else {
8749 /* wrong return value */
8750 PyErr_SetString(PyExc_TypeError,
8751 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752 Py_DECREF(x);
8753 return -1;
8754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755}
Victor Stinner1194ea02014-04-04 19:37:40 +02008756
8757/* lookup the character, write the result into the writer.
8758 Return 1 if the result was written into the writer, return 0 if the mapping
8759 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008760static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008761charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8762 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763{
Victor Stinner1194ea02014-04-04 19:37:40 +02008764 PyObject *item;
8765
8766 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008768
8769 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008776
8777 if (item == Py_None) {
8778 Py_DECREF(item);
8779 return 0;
8780 }
8781
8782 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008783 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8784 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8785 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008786 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8787 Py_DECREF(item);
8788 return -1;
8789 }
8790 Py_DECREF(item);
8791 return 1;
8792 }
8793
8794 if (!PyUnicode_Check(item)) {
8795 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008797 }
8798
8799 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8800 Py_DECREF(item);
8801 return -1;
8802 }
8803
8804 Py_DECREF(item);
8805 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806}
8807
Victor Stinner89a76ab2014-04-05 11:44:04 +02008808static int
8809unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8810 Py_UCS1 *translate)
8811{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008812 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813 int ret = 0;
8814
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 if (charmaptranslate_lookup(ch, mapping, &item)) {
8816 return -1;
8817 }
8818
8819 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008820 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008821 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008823 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 /* not found => default to 1:1 mapping */
8825 translate[ch] = ch;
8826 return 1;
8827 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008829 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008830 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8831 used it */
8832 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833 /* invalid character or character outside ASCII:
8834 skip the fast translate */
8835 goto exit;
8836 }
8837 translate[ch] = (Py_UCS1)replace;
8838 }
8839 else if (PyUnicode_Check(item)) {
8840 Py_UCS4 replace;
8841
8842 if (PyUnicode_READY(item) == -1) {
8843 Py_DECREF(item);
8844 return -1;
8845 }
8846 if (PyUnicode_GET_LENGTH(item) != 1)
8847 goto exit;
8848
8849 replace = PyUnicode_READ_CHAR(item, 0);
8850 if (replace > 127)
8851 goto exit;
8852 translate[ch] = (Py_UCS1)replace;
8853 }
8854 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008855 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 goto exit;
8857 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 ret = 1;
8859
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 exit:
8861 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 return ret;
8863}
8864
8865/* Fast path for ascii => ascii translation. Return 1 if the whole string
8866 was translated into writer, return 0 if the input string was partially
8867 translated into writer, raise an exception and return -1 on error. */
8868static int
8869unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008870 _PyUnicodeWriter *writer, int ignore,
8871 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872{
Victor Stinner872b2912014-04-05 14:27:07 +02008873 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 Py_ssize_t len;
8875 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008876 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878 len = PyUnicode_GET_LENGTH(input);
8879
Victor Stinner872b2912014-04-05 14:27:07 +02008880 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881
8882 in = PyUnicode_1BYTE_DATA(input);
8883 end = in + len;
8884
8885 assert(PyUnicode_IS_ASCII(writer->buffer));
8886 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8887 out = PyUnicode_1BYTE_DATA(writer->buffer);
8888
Victor Stinner872b2912014-04-05 14:27:07 +02008889 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008891 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008893 int translate = unicode_fast_translate_lookup(mapping, ch,
8894 ascii_table);
8895 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008897 if (translate == 0)
8898 goto exit;
8899 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 }
Victor Stinner872b2912014-04-05 14:27:07 +02008901 if (ch2 == 0xfe) {
8902 if (ignore)
8903 continue;
8904 goto exit;
8905 }
8906 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008908 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909 }
Victor Stinner872b2912014-04-05 14:27:07 +02008910 res = 1;
8911
8912exit:
8913 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008914 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008915 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916}
8917
Victor Stinner3222da22015-10-01 22:07:32 +02008918static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919_PyUnicode_TranslateCharmap(PyObject *input,
8920 PyObject *mapping,
8921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008924 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 Py_ssize_t size, i;
8926 int kind;
8927 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008928 _PyUnicodeWriter writer;
8929 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 char *reason = "character maps to <undefined>";
8931 PyObject *errorHandler = NULL;
8932 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 PyErr_BadArgument();
8938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 if (PyUnicode_READY(input) == -1)
8942 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 kind = PyUnicode_KIND(input);
8945 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008947 if (size == 0)
8948 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950 /* allocate enough for a simple 1:1 translation without
8951 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008952 _PyUnicodeWriter_Init(&writer);
8953 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
Victor Stinner872b2912014-04-05 14:27:07 +02008956 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8957
Victor Stinner33798672016-03-01 21:59:58 +01008958 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008959 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008960 if (PyUnicode_IS_ASCII(input)) {
8961 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8962 if (res < 0) {
8963 _PyUnicodeWriter_Dealloc(&writer);
8964 return NULL;
8965 }
8966 if (res == 1)
8967 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 }
Victor Stinner33798672016-03-01 21:59:58 +01008969 else {
8970 i = 0;
8971 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008975 int translate;
8976 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8977 Py_ssize_t newpos;
8978 /* startpos for collecting untranslatable chars */
8979 Py_ssize_t collstart;
8980 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 ch = PyUnicode_READ(kind, data, i);
8984 translate = charmaptranslate_output(ch, mapping, &writer);
8985 if (translate < 0)
8986 goto onError;
8987
8988 if (translate != 0) {
8989 /* it worked => adjust input pointer */
8990 ++i;
8991 continue;
8992 }
8993
8994 /* untranslatable character */
8995 collstart = i;
8996 collend = i+1;
8997
8998 /* find all untranslatable characters */
8999 while (collend < size) {
9000 PyObject *x;
9001 ch = PyUnicode_READ(kind, data, collend);
9002 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009003 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 Py_XDECREF(x);
9005 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 ++collend;
9008 }
9009
9010 if (ignore) {
9011 i = collend;
9012 }
9013 else {
9014 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9015 reason, input, &exc,
9016 collstart, collend, &newpos);
9017 if (repunicode == NULL)
9018 goto onError;
9019 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009022 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009023 Py_DECREF(repunicode);
9024 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009025 }
9026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 Py_XDECREF(exc);
9028 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009032 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 Py_XDECREF(exc);
9034 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035 return NULL;
9036}
9037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038/* Deprecated. Use PyUnicode_Translate instead. */
9039PyObject *
9040PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9041 Py_ssize_t size,
9042 PyObject *mapping,
9043 const char *errors)
9044{
Christian Heimes5f520f42012-09-11 14:03:25 +02009045 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9047 if (!unicode)
9048 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009049 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9050 Py_DECREF(unicode);
9051 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052}
9053
Alexander Belopolsky40018472011-02-26 01:02:56 +00009054PyObject *
9055PyUnicode_Translate(PyObject *str,
9056 PyObject *mapping,
9057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009059 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009060 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009061 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
Tim Petersced69f82003-09-16 20:30:58 +00009063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009065fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066{
9067 /* No need to call PyUnicode_READY(self) because this function is only
9068 called as a callback from fixup() which does it already. */
9069 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9070 const int kind = PyUnicode_KIND(self);
9071 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009072 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009073 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 Py_ssize_t i;
9075
9076 for (i = 0; i < len; ++i) {
9077 ch = PyUnicode_READ(kind, data, i);
9078 fixed = 0;
9079 if (ch > 127) {
9080 if (Py_UNICODE_ISSPACE(ch))
9081 fixed = ' ';
9082 else {
9083 const int decimal = Py_UNICODE_TODECIMAL(ch);
9084 if (decimal >= 0)
9085 fixed = '0' + decimal;
9086 }
9087 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009088 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009089 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 PyUnicode_WRITE(kind, data, i, fixed);
9091 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009092 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009093 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 }
9096
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009097 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098}
9099
9100PyObject *
9101_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9102{
9103 if (!PyUnicode_Check(unicode)) {
9104 PyErr_BadInternalCall();
9105 return NULL;
9106 }
9107 if (PyUnicode_READY(unicode) == -1)
9108 return NULL;
9109 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9110 /* If the string is already ASCII, just return the same string */
9111 Py_INCREF(unicode);
9112 return unicode;
9113 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009114 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115}
9116
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009117PyObject *
9118PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9119 Py_ssize_t length)
9120{
Victor Stinnerf0124502011-11-21 23:12:56 +01009121 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009122 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009123 Py_UCS4 maxchar;
9124 enum PyUnicode_Kind kind;
9125 void *data;
9126
Victor Stinner99d7ad02012-02-22 13:37:39 +01009127 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009128 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009129 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009130 if (ch > 127) {
9131 int decimal = Py_UNICODE_TODECIMAL(ch);
9132 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009133 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009134 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009135 }
9136 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009137
9138 /* Copy to a new string */
9139 decimal = PyUnicode_New(length, maxchar);
9140 if (decimal == NULL)
9141 return decimal;
9142 kind = PyUnicode_KIND(decimal);
9143 data = PyUnicode_DATA(decimal);
9144 /* Iterate over code points */
9145 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009146 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009147 if (ch > 127) {
9148 int decimal = Py_UNICODE_TODECIMAL(ch);
9149 if (decimal >= 0)
9150 ch = '0' + decimal;
9151 }
9152 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009154 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009155}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009156/* --- Decimal Encoder ---------------------------------------------------- */
9157
Alexander Belopolsky40018472011-02-26 01:02:56 +00009158int
9159PyUnicode_EncodeDecimal(Py_UNICODE *s,
9160 Py_ssize_t length,
9161 char *output,
9162 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009163{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009164 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009165 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009166 enum PyUnicode_Kind kind;
9167 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009168
9169 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 PyErr_BadArgument();
9171 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009172 }
9173
Victor Stinner42bf7752011-11-21 22:52:58 +01009174 unicode = PyUnicode_FromUnicode(s, length);
9175 if (unicode == NULL)
9176 return -1;
9177
Benjamin Petersonbac79492012-01-14 13:34:47 -05009178 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009179 Py_DECREF(unicode);
9180 return -1;
9181 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009182 kind = PyUnicode_KIND(unicode);
9183 data = PyUnicode_DATA(unicode);
9184
Victor Stinnerb84d7232011-11-22 01:50:07 +01009185 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009186 PyObject *exc;
9187 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009189 Py_ssize_t startpos;
9190
9191 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009192
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009195 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 decimal = Py_UNICODE_TODECIMAL(ch);
9199 if (decimal >= 0) {
9200 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009201 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 continue;
9203 }
9204 if (0 < ch && ch < 256) {
9205 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009206 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 continue;
9208 }
Victor Stinner6345be92011-11-25 20:09:01 +01009209
Victor Stinner42bf7752011-11-21 22:52:58 +01009210 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009211 exc = NULL;
9212 raise_encode_exception(&exc, "decimal", unicode,
9213 startpos, startpos+1,
9214 "invalid decimal Unicode string");
9215 Py_XDECREF(exc);
9216 Py_DECREF(unicode);
9217 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009218 }
9219 /* 0-terminate the output string */
9220 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009221 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009222 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009223}
9224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225/* --- Helpers ------------------------------------------------------------ */
9226
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative to
   `len` and then clamped to 0.  `start` and `end` must be modifiable lvalues;
   every argument may be evaluated more than once.

   The body is wrapped in do { } while (0) so the macro expands to exactly one
   statement and can safely be used as the body of an if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009243any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009245 Py_ssize_t end,
9246 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009248 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 void *buf1, *buf2;
9250 Py_ssize_t len1, len2, result;
9251
9252 kind1 = PyUnicode_KIND(s1);
9253 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009254 if (kind1 < kind2)
9255 return -1;
9256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 len1 = PyUnicode_GET_LENGTH(s1);
9258 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009259 ADJUST_INDICES(start, end, len1);
9260 if (end - start < len2)
9261 return -1;
9262
9263 buf1 = PyUnicode_DATA(s1);
9264 buf2 = PyUnicode_DATA(s2);
9265 if (len2 == 1) {
9266 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9267 result = findchar((const char *)buf1 + kind1*start,
9268 kind1, end - start, ch, direction);
9269 if (result == -1)
9270 return -1;
9271 else
9272 return start + result;
9273 }
9274
9275 if (kind2 != kind1) {
9276 buf2 = _PyUnicode_AsKind(s2, kind1);
9277 if (!buf2)
9278 return -2;
9279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280
Victor Stinner794d5672011-10-10 03:21:36 +02009281 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009282 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009283 case PyUnicode_1BYTE_KIND:
9284 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9285 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9286 else
9287 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_2BYTE_KIND:
9290 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 case PyUnicode_4BYTE_KIND:
9293 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9294 break;
9295 default:
9296 assert(0); result = -2;
9297 }
9298 }
9299 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009300 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009301 case PyUnicode_1BYTE_KIND:
9302 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9303 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 else
9305 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_2BYTE_KIND:
9308 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 case PyUnicode_4BYTE_KIND:
9311 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9312 break;
9313 default:
9314 assert(0); result = -2;
9315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 }
9317
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009318 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 PyMem_Free(buf2);
9320
9321 return result;
9322}
9323
9324Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009325_PyUnicode_InsertThousandsGrouping(
9326 PyObject *unicode, Py_ssize_t index,
9327 Py_ssize_t n_buffer,
9328 void *digits, Py_ssize_t n_digits,
9329 Py_ssize_t min_width,
9330 const char *grouping, PyObject *thousands_sep,
9331 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332{
Victor Stinner41a863c2012-02-24 00:37:51 +01009333 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009334 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 Py_ssize_t thousands_sep_len;
9336 Py_ssize_t len;
9337
9338 if (unicode != NULL) {
9339 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009340 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009341 }
9342 else {
9343 kind = PyUnicode_1BYTE_KIND;
9344 data = NULL;
9345 }
9346 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9347 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9348 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9349 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009350 if (thousands_sep_kind < kind) {
9351 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9352 if (!thousands_sep_data)
9353 return -1;
9354 }
9355 else {
9356 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9357 if (!data)
9358 return -1;
9359 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 }
9361
Benjamin Petersonead6b532011-12-20 17:23:42 -06009362 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009364 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009366 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009368 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009369 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009371 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009373 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009377 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009379 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009383 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009385 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 break;
9387 default:
9388 assert(0);
9389 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009391 if (unicode != NULL && thousands_sep_kind != kind) {
9392 if (thousands_sep_kind < kind)
9393 PyMem_Free(thousands_sep_data);
9394 else
9395 PyMem_Free(data);
9396 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009397 if (unicode == NULL) {
9398 *maxchar = 127;
9399 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009400 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009401 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009402 }
9403 }
9404 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405}
9406
9407
Alexander Belopolsky40018472011-02-26 01:02:56 +00009408Py_ssize_t
9409PyUnicode_Count(PyObject *str,
9410 PyObject *substr,
9411 Py_ssize_t start,
9412 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009414 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009415 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416 void *buf1 = NULL, *buf2 = NULL;
9417 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009420 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009421
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009422 kind1 = PyUnicode_KIND(str);
9423 kind2 = PyUnicode_KIND(substr);
9424 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009425 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 len1 = PyUnicode_GET_LENGTH(str);
9428 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 buf1 = PyUnicode_DATA(str);
9434 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009436 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009437 if (!buf2)
9438 goto onError;
9439 }
9440
9441 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009443 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009444 result = asciilib_count(
9445 ((Py_UCS1*)buf1) + start, end - start,
9446 buf2, len2, PY_SSIZE_T_MAX
9447 );
9448 else
9449 result = ucs1lib_count(
9450 ((Py_UCS1*)buf1) + start, end - start,
9451 buf2, len2, PY_SSIZE_T_MAX
9452 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 break;
9454 case PyUnicode_2BYTE_KIND:
9455 result = ucs2lib_count(
9456 ((Py_UCS2*)buf1) + start, end - start,
9457 buf2, len2, PY_SSIZE_T_MAX
9458 );
9459 break;
9460 case PyUnicode_4BYTE_KIND:
9461 result = ucs4lib_count(
9462 ((Py_UCS4*)buf1) + start, end - start,
9463 buf2, len2, PY_SSIZE_T_MAX
9464 );
9465 break;
9466 default:
9467 assert(0); result = 0;
9468 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009469
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009470 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 PyMem_Free(buf2);
9472
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009475 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 PyMem_Free(buf2);
9477 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480Py_ssize_t
9481PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009482 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009483 Py_ssize_t start,
9484 Py_ssize_t end,
9485 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009489
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009490 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491}
9492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493Py_ssize_t
9494PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9495 Py_ssize_t start, Py_ssize_t end,
9496 int direction)
9497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009499 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 if (PyUnicode_READY(str) == -1)
9501 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009502 if (start < 0 || end < 0) {
9503 PyErr_SetString(PyExc_IndexError, "string index out of range");
9504 return -2;
9505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 if (end > PyUnicode_GET_LENGTH(str))
9507 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009508 if (start >= end)
9509 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009511 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9512 kind, end-start, ch, direction);
9513 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009515 else
9516 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517}
9518
Alexander Belopolsky40018472011-02-26 01:02:56 +00009519static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009520tailmatch(PyObject *self,
9521 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009522 Py_ssize_t start,
9523 Py_ssize_t end,
9524 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 int kind_self;
9527 int kind_sub;
9528 void *data_self;
9529 void *data_sub;
9530 Py_ssize_t offset;
9531 Py_ssize_t i;
9532 Py_ssize_t end_sub;
9533
9534 if (PyUnicode_READY(self) == -1 ||
9535 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009536 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9539 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009543 if (PyUnicode_GET_LENGTH(substring) == 0)
9544 return 1;
9545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 kind_self = PyUnicode_KIND(self);
9547 data_self = PyUnicode_DATA(self);
9548 kind_sub = PyUnicode_KIND(substring);
9549 data_sub = PyUnicode_DATA(substring);
9550 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9551
9552 if (direction > 0)
9553 offset = end;
9554 else
9555 offset = start;
9556
9557 if (PyUnicode_READ(kind_self, data_self, offset) ==
9558 PyUnicode_READ(kind_sub, data_sub, 0) &&
9559 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9560 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9561 /* If both are of the same kind, memcmp is sufficient */
9562 if (kind_self == kind_sub) {
9563 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009564 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 data_sub,
9566 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009567 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009569 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 else {
9571 /* We do not need to compare 0 and len(substring)-1 because
9572 the if statement above ensured already that they are equal
9573 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 for (i = 1; i < end_sub; ++i) {
9575 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9576 PyUnicode_READ(kind_sub, data_sub, i))
9577 return 0;
9578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 }
9582
9583 return 0;
9584}
9585
Alexander Belopolsky40018472011-02-26 01:02:56 +00009586Py_ssize_t
9587PyUnicode_Tailmatch(PyObject *str,
9588 PyObject *substr,
9589 Py_ssize_t start,
9590 Py_ssize_t end,
9591 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009595
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009596 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597}
9598
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599/* Apply fixfct filter to the Unicode object self and return a
9600 reference to the modified object */
9601
Alexander Belopolsky40018472011-02-26 01:02:56 +00009602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009603fixup(PyObject *self,
9604 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 PyObject *u;
9607 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009608 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009610 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009613 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 /* fix functions return the new maximum character in a string,
9616 if the kind of the resulting unicode object does not change,
9617 everything is fine. Otherwise we need to change the string kind
9618 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009619 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009620
9621 if (maxchar_new == 0) {
9622 /* no changes */;
9623 if (PyUnicode_CheckExact(self)) {
9624 Py_DECREF(u);
9625 Py_INCREF(self);
9626 return self;
9627 }
9628 else
9629 return u;
9630 }
9631
Victor Stinnere6abb482012-05-02 01:15:40 +02009632 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633
Victor Stinnereaab6042011-12-11 22:22:39 +01009634 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009636
9637 /* In case the maximum character changed, we need to
9638 convert the string to the new category. */
9639 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9640 if (v == NULL) {
9641 Py_DECREF(u);
9642 return NULL;
9643 }
9644 if (maxchar_new > maxchar_old) {
9645 /* If the maxchar increased so that the kind changed, not all
9646 characters are representable anymore and we need to fix the
9647 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009648 _PyUnicode_FastCopyCharacters(v, 0,
9649 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009650 maxchar_old = fixfct(v);
9651 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 }
9653 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009654 _PyUnicode_FastCopyCharacters(v, 0,
9655 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009657 Py_DECREF(u);
9658 assert(_PyUnicode_CheckConsistency(v, 1));
9659 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660}
9661
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662static PyObject *
9663ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9666 char *resdata, *data = PyUnicode_DATA(self);
9667 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009668
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 res = PyUnicode_New(len, 127);
9670 if (res == NULL)
9671 return NULL;
9672 resdata = PyUnicode_DATA(res);
9673 if (lower)
9674 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 _Py_bytes_upper(resdata, data, len);
9677 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678}
9679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 Py_ssize_t j;
9684 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009685 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009687
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9689
9690 where ! is a negation and \p{xxx} is a character with property xxx.
9691 */
9692 for (j = i - 1; j >= 0; j--) {
9693 c = PyUnicode_READ(kind, data, j);
9694 if (!_PyUnicode_IsCaseIgnorable(c))
9695 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9698 if (final_sigma) {
9699 for (j = i + 1; j < length; j++) {
9700 c = PyUnicode_READ(kind, data, j);
9701 if (!_PyUnicode_IsCaseIgnorable(c))
9702 break;
9703 }
9704 final_sigma = j == length || !_PyUnicode_IsCased(c);
9705 }
9706 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707}
9708
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709static int
9710lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9711 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 /* Obscure special case. */
9714 if (c == 0x3A3) {
9715 mapped[0] = handle_capital_sigma(kind, data, length, i);
9716 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719}
9720
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721static Py_ssize_t
9722do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 Py_ssize_t i, k = 0;
9725 int n_res, j;
9726 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009727
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009728 c = PyUnicode_READ(kind, data, 0);
9729 n_res = _PyUnicode_ToUpperFull(c, mapped);
9730 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009731 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 for (i = 1; i < length; i++) {
9735 c = PyUnicode_READ(kind, data, i);
9736 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9737 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009738 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009740 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009741 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743}
9744
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009745static Py_ssize_t
9746do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9747 Py_ssize_t i, k = 0;
9748
9749 for (i = 0; i < length; i++) {
9750 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9751 int n_res, j;
9752 if (Py_UNICODE_ISUPPER(c)) {
9753 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9754 }
9755 else if (Py_UNICODE_ISLOWER(c)) {
9756 n_res = _PyUnicode_ToUpperFull(c, mapped);
9757 }
9758 else {
9759 n_res = 1;
9760 mapped[0] = c;
9761 }
9762 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009763 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009764 res[k++] = mapped[j];
9765 }
9766 }
9767 return k;
9768}
9769
9770static Py_ssize_t
9771do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9772 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009774 Py_ssize_t i, k = 0;
9775
9776 for (i = 0; i < length; i++) {
9777 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9778 int n_res, j;
9779 if (lower)
9780 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9781 else
9782 n_res = _PyUnicode_ToUpperFull(c, mapped);
9783 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009784 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009785 res[k++] = mapped[j];
9786 }
9787 }
9788 return k;
9789}
9790
9791static Py_ssize_t
9792do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9793{
9794 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9795}
9796
9797static Py_ssize_t
9798do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9799{
9800 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9801}
9802
Benjamin Petersone51757f2012-01-12 21:10:29 -05009803static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009804do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9805{
9806 Py_ssize_t i, k = 0;
9807
9808 for (i = 0; i < length; i++) {
9809 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9810 Py_UCS4 mapped[3];
9811 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9812 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009813 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009814 res[k++] = mapped[j];
9815 }
9816 }
9817 return k;
9818}
9819
9820static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009821do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9822{
9823 Py_ssize_t i, k = 0;
9824 int previous_is_cased;
9825
9826 previous_is_cased = 0;
9827 for (i = 0; i < length; i++) {
9828 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9829 Py_UCS4 mapped[3];
9830 int n_res, j;
9831
9832 if (previous_is_cased)
9833 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9834 else
9835 n_res = _PyUnicode_ToTitleFull(c, mapped);
9836
9837 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009838 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009839 res[k++] = mapped[j];
9840 }
9841
9842 previous_is_cased = _PyUnicode_IsCased(c);
9843 }
9844 return k;
9845}
9846
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009847static PyObject *
9848case_operation(PyObject *self,
9849 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9850{
9851 PyObject *res = NULL;
9852 Py_ssize_t length, newlength = 0;
9853 int kind, outkind;
9854 void *data, *outdata;
9855 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9856
Benjamin Petersoneea48462012-01-16 14:28:50 -05009857 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009858
9859 kind = PyUnicode_KIND(self);
9860 data = PyUnicode_DATA(self);
9861 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009862 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009863 PyErr_SetString(PyExc_OverflowError, "string is too long");
9864 return NULL;
9865 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009866 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009867 if (tmp == NULL)
9868 return PyErr_NoMemory();
9869 newlength = perform(kind, data, length, tmp, &maxchar);
9870 res = PyUnicode_New(newlength, maxchar);
9871 if (res == NULL)
9872 goto leave;
9873 tmpend = tmp + newlength;
9874 outdata = PyUnicode_DATA(res);
9875 outkind = PyUnicode_KIND(res);
9876 switch (outkind) {
9877 case PyUnicode_1BYTE_KIND:
9878 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9879 break;
9880 case PyUnicode_2BYTE_KIND:
9881 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9882 break;
9883 case PyUnicode_4BYTE_KIND:
9884 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9885 break;
9886 default:
9887 assert(0);
9888 break;
9889 }
9890 leave:
9891 PyMem_FREE(tmp);
9892 return res;
9893}
9894
Tim Peters8ce9f162004-08-27 01:49:32 +00009895PyObject *
9896PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009898 PyObject *res;
9899 PyObject *fseq;
9900 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009901 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009903 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009905 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009906 }
9907
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009908 /* NOTE: the following code can't call back into Python code,
9909 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009910 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009911
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009912 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009913 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009914 res = _PyUnicode_JoinArray(separator, items, seqlen);
9915 Py_DECREF(fseq);
9916 return res;
9917}
9918
9919PyObject *
9920_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9921{
9922 PyObject *res = NULL; /* the result */
9923 PyObject *sep = NULL;
9924 Py_ssize_t seplen;
9925 PyObject *item;
9926 Py_ssize_t sz, i, res_offset;
9927 Py_UCS4 maxchar;
9928 Py_UCS4 item_maxchar;
9929 int use_memcpy;
9930 unsigned char *res_data = NULL, *sep_data = NULL;
9931 PyObject *last_obj;
9932 unsigned int kind = 0;
9933
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 /* If empty sequence, return u"". */
9935 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009936 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009937 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009938
Tim Peters05eba1f2004-08-27 21:32:02 +00009939 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009940 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009941 if (seqlen == 1) {
9942 if (PyUnicode_CheckExact(items[0])) {
9943 res = items[0];
9944 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009945 return res;
9946 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009947 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009948 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009949 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009950 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009951 /* Set up sep and seplen */
9952 if (separator == NULL) {
9953 /* fall back to a blank space separator */
9954 sep = PyUnicode_FromOrdinal(' ');
9955 if (!sep)
9956 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009957 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009958 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009959 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009960 else {
9961 if (!PyUnicode_Check(separator)) {
9962 PyErr_Format(PyExc_TypeError,
9963 "separator: expected str instance,"
9964 " %.80s found",
9965 Py_TYPE(separator)->tp_name);
9966 goto onError;
9967 }
9968 if (PyUnicode_READY(separator))
9969 goto onError;
9970 sep = separator;
9971 seplen = PyUnicode_GET_LENGTH(separator);
9972 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9973 /* inc refcount to keep this code path symmetric with the
9974 above case of a blank separator */
9975 Py_INCREF(sep);
9976 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009977 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009978 }
9979
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 /* There are at least two things to join, or else we have a subclass
9981 * of str in the sequence.
9982 * Do a pre-pass to figure out the total amount of space we'll
9983 * need (sz), and see whether all argument are strings.
9984 */
9985 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009986#ifdef Py_DEBUG
9987 use_memcpy = 0;
9988#else
9989 use_memcpy = 1;
9990#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009991 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009992 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009993 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 if (!PyUnicode_Check(item)) {
9995 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009996 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 " %.80s found",
9998 i, Py_TYPE(item)->tp_name);
9999 goto onError;
10000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (PyUnicode_READY(item) == -1)
10002 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010003 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010005 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010006 if (i != 0) {
10007 add_sz += seplen;
10008 }
10009 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010010 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 goto onError;
10013 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010014 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010015 if (use_memcpy && last_obj != NULL) {
10016 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10017 use_memcpy = 0;
10018 }
10019 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 }
Tim Petersced69f82003-09-16 20:30:58 +000010021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010023 if (res == NULL)
10024 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010025
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010026 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010027#ifdef Py_DEBUG
10028 use_memcpy = 0;
10029#else
10030 if (use_memcpy) {
10031 res_data = PyUnicode_1BYTE_DATA(res);
10032 kind = PyUnicode_KIND(res);
10033 if (seplen != 0)
10034 sep_data = PyUnicode_1BYTE_DATA(sep);
10035 }
10036#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010037 if (use_memcpy) {
10038 for (i = 0; i < seqlen; ++i) {
10039 Py_ssize_t itemlen;
10040 item = items[i];
10041
10042 /* Copy item, and maybe the separator. */
10043 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010044 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010046 kind * seplen);
10047 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010048 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010049
10050 itemlen = PyUnicode_GET_LENGTH(item);
10051 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010052 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010054 kind * itemlen);
10055 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010056 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010057 }
10058 assert(res_data == PyUnicode_1BYTE_DATA(res)
10059 + kind * PyUnicode_GET_LENGTH(res));
10060 }
10061 else {
10062 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10063 Py_ssize_t itemlen;
10064 item = items[i];
10065
10066 /* Copy item, and maybe the separator. */
10067 if (i && seplen != 0) {
10068 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10069 res_offset += seplen;
10070 }
10071
10072 itemlen = PyUnicode_GET_LENGTH(item);
10073 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010074 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 res_offset += itemlen;
10076 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010077 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010078 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010079 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010082 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010087 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088 return NULL;
10089}
10090
/* FILL(kind, data, value, start, length)

   Store value into length consecutive code units of the buffer data,
   beginning at code-unit index start.  kind selects the code-unit width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND and any other value trip an assertion.

   This is a statement-like macro: the arguments are expanded in several
   places and value and length are evaluated repeatedly inside the loops, so
   they must be free of side effects.  Every argument is parenthesized in
   the expansion so that expression arguments (e.g. x >> 8) keep their
   meaning, in particular under the unsigned char cast of the 1-byte case. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10114
Victor Stinnerd3f08822012-05-29 12:57:52 +020010115void
10116_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10117 Py_UCS4 fill_char)
10118{
10119 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10120 const void *data = PyUnicode_DATA(unicode);
10121 assert(PyUnicode_IS_READY(unicode));
10122 assert(unicode_modifiable(unicode));
10123 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10124 assert(start >= 0);
10125 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10126 FILL(kind, data, fill_char, start, length);
10127}
10128
Victor Stinner3fe55312012-01-04 00:33:50 +010010129Py_ssize_t
10130PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10131 Py_UCS4 fill_char)
10132{
10133 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010134
10135 if (!PyUnicode_Check(unicode)) {
10136 PyErr_BadInternalCall();
10137 return -1;
10138 }
10139 if (PyUnicode_READY(unicode) == -1)
10140 return -1;
10141 if (unicode_check_modifiable(unicode))
10142 return -1;
10143
Victor Stinnerd3f08822012-05-29 12:57:52 +020010144 if (start < 0) {
10145 PyErr_SetString(PyExc_IndexError, "string index out of range");
10146 return -1;
10147 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010148 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10149 PyErr_SetString(PyExc_ValueError,
10150 "fill character is bigger than "
10151 "the string maximum character");
10152 return -1;
10153 }
10154
10155 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10156 length = Py_MIN(maxlen, length);
10157 if (length <= 0)
10158 return 0;
10159
Victor Stinnerd3f08822012-05-29 12:57:52 +020010160 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010161 return length;
10162}
10163
Victor Stinner9310abb2011-10-05 00:59:23 +020010164static PyObject *
10165pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010166 Py_ssize_t left,
10167 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 PyObject *u;
10171 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010172 int kind;
10173 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
10175 if (left < 0)
10176 left = 0;
10177 if (right < 0)
10178 right = 0;
10179
Victor Stinnerc4b49542011-12-11 22:44:26 +010010180 if (left == 0 && right == 0)
10181 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10184 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010185 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10186 return NULL;
10187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010189 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010191 if (!u)
10192 return NULL;
10193
10194 kind = PyUnicode_KIND(u);
10195 data = PyUnicode_DATA(u);
10196 if (left)
10197 FILL(kind, data, fill, 0, left);
10198 if (right)
10199 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010200 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010201 assert(_PyUnicode_CheckConsistency(u, 1));
10202 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203}
10204
Alexander Belopolsky40018472011-02-26 01:02:56 +000010205PyObject *
10206PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010210 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
Benjamin Petersonead6b532011-12-20 17:23:42 -060010213 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010215 if (PyUnicode_IS_ASCII(string))
10216 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 PyUnicode_GET_LENGTH(string), keepends);
10219 else
10220 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010221 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 break;
10224 case PyUnicode_2BYTE_KIND:
10225 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyUnicode_GET_LENGTH(string), keepends);
10228 break;
10229 case PyUnicode_4BYTE_KIND:
10230 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010231 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 PyUnicode_GET_LENGTH(string), keepends);
10233 break;
10234 default:
10235 assert(0);
10236 list = 0;
10237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239}
10240
Alexander Belopolsky40018472011-02-26 01:02:56 +000010241static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010242split(PyObject *self,
10243 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010244 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010246 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 void *buf1, *buf2;
10248 Py_ssize_t len1, len2;
10249 PyObject* out;
10250
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010252 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 if (PyUnicode_READY(self) == -1)
10255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010258 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010260 if (PyUnicode_IS_ASCII(self))
10261 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
10265 else
10266 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 case PyUnicode_2BYTE_KIND:
10271 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 case PyUnicode_4BYTE_KIND:
10276 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 PyUnicode_GET_LENGTH(self), maxcount
10279 );
10280 default:
10281 assert(0);
10282 return NULL;
10283 }
10284
10285 if (PyUnicode_READY(substring) == -1)
10286 return NULL;
10287
10288 kind1 = PyUnicode_KIND(self);
10289 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 len1 = PyUnicode_GET_LENGTH(self);
10291 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010292 if (kind1 < kind2 || len1 < len2) {
10293 out = PyList_New(1);
10294 if (out == NULL)
10295 return NULL;
10296 Py_INCREF(self);
10297 PyList_SET_ITEM(out, 0, self);
10298 return out;
10299 }
10300 buf1 = PyUnicode_DATA(self);
10301 buf2 = PyUnicode_DATA(substring);
10302 if (kind2 != kind1) {
10303 buf2 = _PyUnicode_AsKind(substring, kind1);
10304 if (!buf2)
10305 return NULL;
10306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010308 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010310 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10311 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010313 else
10314 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010315 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 break;
10317 case PyUnicode_2BYTE_KIND:
10318 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 break;
10321 case PyUnicode_4BYTE_KIND:
10322 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010323 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 break;
10325 default:
10326 out = NULL;
10327 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010328 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 PyMem_Free(buf2);
10330 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331}
10332
Alexander Belopolsky40018472011-02-26 01:02:56 +000010333static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010334rsplit(PyObject *self,
10335 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010336 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010338 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 void *buf1, *buf2;
10340 Py_ssize_t len1, len2;
10341 PyObject* out;
10342
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010343 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010344 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (PyUnicode_READY(self) == -1)
10347 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010350 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010352 if (PyUnicode_IS_ASCII(self))
10353 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010355 PyUnicode_GET_LENGTH(self), maxcount
10356 );
10357 else
10358 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010360 PyUnicode_GET_LENGTH(self), maxcount
10361 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 case PyUnicode_2BYTE_KIND:
10363 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010364 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 PyUnicode_GET_LENGTH(self), maxcount
10366 );
10367 case PyUnicode_4BYTE_KIND:
10368 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010369 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 PyUnicode_GET_LENGTH(self), maxcount
10371 );
10372 default:
10373 assert(0);
10374 return NULL;
10375 }
10376
10377 if (PyUnicode_READY(substring) == -1)
10378 return NULL;
10379
10380 kind1 = PyUnicode_KIND(self);
10381 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 len1 = PyUnicode_GET_LENGTH(self);
10383 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010384 if (kind1 < kind2 || len1 < len2) {
10385 out = PyList_New(1);
10386 if (out == NULL)
10387 return NULL;
10388 Py_INCREF(self);
10389 PyList_SET_ITEM(out, 0, self);
10390 return out;
10391 }
10392 buf1 = PyUnicode_DATA(self);
10393 buf2 = PyUnicode_DATA(substring);
10394 if (kind2 != kind1) {
10395 buf2 = _PyUnicode_AsKind(substring, kind1);
10396 if (!buf2)
10397 return NULL;
10398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010400 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10403 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010405 else
10406 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_2BYTE_KIND:
10410 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 break;
10413 case PyUnicode_4BYTE_KIND:
10414 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010415 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 break;
10417 default:
10418 out = NULL;
10419 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010420 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 PyMem_Free(buf2);
10422 return out;
10423}
10424
10425static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10427 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010429 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10432 return asciilib_find(buf1, len1, buf2, len2, offset);
10433 else
10434 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 case PyUnicode_2BYTE_KIND:
10436 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10437 case PyUnicode_4BYTE_KIND:
10438 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10439 }
10440 assert(0);
10441 return -1;
10442}
10443
10444static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010445anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10446 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010448 switch (kind) {
10449 case PyUnicode_1BYTE_KIND:
10450 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10451 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10452 else
10453 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10454 case PyUnicode_2BYTE_KIND:
10455 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10456 case PyUnicode_4BYTE_KIND:
10457 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10458 }
10459 assert(0);
10460 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010461}
10462
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010463static void
10464replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10465 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10466{
10467 int kind = PyUnicode_KIND(u);
10468 void *data = PyUnicode_DATA(u);
10469 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10470 if (kind == PyUnicode_1BYTE_KIND) {
10471 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10472 (Py_UCS1 *)data + len,
10473 u1, u2, maxcount);
10474 }
10475 else if (kind == PyUnicode_2BYTE_KIND) {
10476 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10477 (Py_UCS2 *)data + len,
10478 u1, u2, maxcount);
10479 }
10480 else {
10481 assert(kind == PyUnicode_4BYTE_KIND);
10482 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10483 (Py_UCS4 *)data + len,
10484 u1, u2, maxcount);
10485 }
10486}
10487
Alexander Belopolsky40018472011-02-26 01:02:56 +000010488static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489replace(PyObject *self, PyObject *str1,
10490 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 PyObject *u;
10493 char *sbuf = PyUnicode_DATA(self);
10494 char *buf1 = PyUnicode_DATA(str1);
10495 char *buf2 = PyUnicode_DATA(str2);
10496 int srelease = 0, release1 = 0, release2 = 0;
10497 int skind = PyUnicode_KIND(self);
10498 int kind1 = PyUnicode_KIND(str1);
10499 int kind2 = PyUnicode_KIND(str2);
10500 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10501 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10502 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010503 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010504 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
10506 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010509 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510
Victor Stinner59de0ee2011-10-07 10:01:28 +020010511 if (str1 == str2)
10512 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513
Victor Stinner49a0a212011-10-12 23:46:10 +020010514 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010515 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10516 if (maxchar < maxchar_str1)
10517 /* substring too wide to be present */
10518 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010519 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10520 /* Replacing str1 with str2 may cause a maxchar reduction in the
10521 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010522 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010523 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010528 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010532 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010533
Victor Stinner69ed0f42013-04-09 21:48:24 +020010534 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010535 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010536 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010537 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010538 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010540 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010542
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010543 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10544 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010545 }
10546 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 int rkind = skind;
10548 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010549 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (kind1 < rkind) {
10552 /* widen substring */
10553 buf1 = _PyUnicode_AsKind(str1, rkind);
10554 if (!buf1) goto error;
10555 release1 = 1;
10556 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010558 if (i < 0)
10559 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 if (rkind > kind2) {
10561 /* widen replacement */
10562 buf2 = _PyUnicode_AsKind(str2, rkind);
10563 if (!buf2) goto error;
10564 release2 = 1;
10565 }
10566 else if (rkind < kind2) {
10567 /* widen self and buf1 */
10568 rkind = kind2;
10569 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010570 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 sbuf = _PyUnicode_AsKind(self, rkind);
10572 if (!sbuf) goto error;
10573 srelease = 1;
10574 buf1 = _PyUnicode_AsKind(str1, rkind);
10575 if (!buf1) goto error;
10576 release1 = 1;
10577 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 u = PyUnicode_New(slen, maxchar);
10579 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 assert(PyUnicode_KIND(u) == rkind);
10582 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010583
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010585 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590
10591 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010593 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010594 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010595 if (i == -1)
10596 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010599 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 }
10604 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010606 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 int rkind = skind;
10608 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010611 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 buf1 = _PyUnicode_AsKind(str1, rkind);
10613 if (!buf1) goto error;
10614 release1 = 1;
10615 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010616 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010617 if (n == 0)
10618 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 buf2 = _PyUnicode_AsKind(str2, rkind);
10622 if (!buf2) goto error;
10623 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 rkind = kind2;
10628 sbuf = _PyUnicode_AsKind(self, rkind);
10629 if (!sbuf) goto error;
10630 srelease = 1;
10631 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010632 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 buf1 = _PyUnicode_AsKind(str1, rkind);
10634 if (!buf1) goto error;
10635 release1 = 1;
10636 }
10637 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10638 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010639 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 PyErr_SetString(PyExc_OverflowError,
10641 "replace string is too long");
10642 goto error;
10643 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010644 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010645 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010646 _Py_INCREF_UNICODE_EMPTY();
10647 if (!unicode_empty)
10648 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 u = unicode_empty;
10650 goto done;
10651 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010652 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 PyErr_SetString(PyExc_OverflowError,
10654 "replace string is too long");
10655 goto error;
10656 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 u = PyUnicode_New(new_size, maxchar);
10658 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010660 assert(PyUnicode_KIND(u) == rkind);
10661 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires = i = 0;
10663 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 while (n-- > 0) {
10665 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010666 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010668 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010669 if (j == -1)
10670 break;
10671 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
10674 sbuf + rkind * i,
10675 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 }
10678 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010680 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010689 memcpy(res + rkind * ires,
10690 sbuf + rkind * i,
10691 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010692 }
10693 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 /* interleave */
10695 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010696 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010698 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700 if (--n <= 0)
10701 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010702 memcpy(res + rkind * ires,
10703 sbuf + rkind * i,
10704 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 ires++;
10706 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010707 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010708 memcpy(res + rkind * ires,
10709 sbuf + rkind * i,
10710 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010711 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 }
10713
10714 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010715 unicode_adjust_maxchar(&u);
10716 if (u == NULL)
10717 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010719
10720 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (srelease)
10722 PyMem_FREE(sbuf);
10723 if (release1)
10724 PyMem_FREE(buf1);
10725 if (release2)
10726 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010727 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010729
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010731 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 if (srelease)
10733 PyMem_FREE(sbuf);
10734 if (release1)
10735 PyMem_FREE(buf1);
10736 if (release2)
10737 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010738 return unicode_result_unchanged(self);
10739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 error:
10741 if (srelease && sbuf)
10742 PyMem_FREE(sbuf);
10743 if (release1 && buf1)
10744 PyMem_FREE(buf1);
10745 if (release2 && buf2)
10746 PyMem_FREE(buf2);
10747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748}
10749
10750/* --- Unicode Object Methods --------------------------------------------- */
10751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754\n\
10755Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010756characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
10758static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010759unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010761 if (PyUnicode_READY(self) == -1)
10762 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010763 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764}
10765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010766PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768\n\
10769Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010770have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
10772static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010773unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010775 if (PyUnicode_READY(self) == -1)
10776 return NULL;
10777 if (PyUnicode_GET_LENGTH(self) == 0)
10778 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010779 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780}
10781
Benjamin Petersond5890c82012-01-14 13:23:30 -050010782PyDoc_STRVAR(casefold__doc__,
10783 "S.casefold() -> str\n\
10784\n\
10785Return a version of S suitable for caseless comparisons.");
10786
10787static PyObject *
10788unicode_casefold(PyObject *self)
10789{
10790 if (PyUnicode_READY(self) == -1)
10791 return NULL;
10792 if (PyUnicode_IS_ASCII(self))
10793 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010794 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010795}
10796
10797
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010798/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010799
10800static int
10801convert_uc(PyObject *obj, void *addr)
10802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010804
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010805 if (!PyUnicode_Check(obj)) {
10806 PyErr_Format(PyExc_TypeError,
10807 "The fill character must be a unicode character, "
10808 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010809 return 0;
10810 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010811 if (PyUnicode_READY(obj) < 0)
10812 return 0;
10813 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010814 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010815 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 return 0;
10817 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010818 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010819 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010820}
10821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010822PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010823 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010825Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010826done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
10828static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010829unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010831 Py_ssize_t marg, left;
10832 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 Py_UCS4 fillchar = ' ';
10834
Victor Stinnere9a29352011-10-01 02:14:59 +020010835 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
Benjamin Petersonbac79492012-01-14 13:34:47 -050010838 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 return NULL;
10840
Victor Stinnerc4b49542011-12-11 22:44:26 +010010841 if (PyUnicode_GET_LENGTH(self) >= width)
10842 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
Victor Stinnerc4b49542011-12-11 22:44:26 +010010844 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 left = marg / 2 + (marg & width & 1);
10846
Victor Stinner9310abb2011-10-05 00:59:23 +020010847 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848}
10849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850/* This function assumes that str1 and str2 are readied by the caller. */
10851
Marc-André Lemburge5034372000-08-08 08:04:29 +000010852static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010853unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010854{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010855#define COMPARE(TYPE1, TYPE2) \
10856 do { \
10857 TYPE1* p1 = (TYPE1 *)data1; \
10858 TYPE2* p2 = (TYPE2 *)data2; \
10859 TYPE1* end = p1 + len; \
10860 Py_UCS4 c1, c2; \
10861 for (; p1 != end; p1++, p2++) { \
10862 c1 = *p1; \
10863 c2 = *p2; \
10864 if (c1 != c2) \
10865 return (c1 < c2) ? -1 : 1; \
10866 } \
10867 } \
10868 while (0)
10869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 int kind1, kind2;
10871 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010872 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 kind1 = PyUnicode_KIND(str1);
10875 kind2 = PyUnicode_KIND(str2);
10876 data1 = PyUnicode_DATA(str1);
10877 data2 = PyUnicode_DATA(str2);
10878 len1 = PyUnicode_GET_LENGTH(str1);
10879 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010880 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010881
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 switch(kind1) {
10883 case PyUnicode_1BYTE_KIND:
10884 {
10885 switch(kind2) {
10886 case PyUnicode_1BYTE_KIND:
10887 {
10888 int cmp = memcmp(data1, data2, len);
10889 /* normalize result of memcmp() into the range [-1; 1] */
10890 if (cmp < 0)
10891 return -1;
10892 if (cmp > 0)
10893 return 1;
10894 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010895 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010896 case PyUnicode_2BYTE_KIND:
10897 COMPARE(Py_UCS1, Py_UCS2);
10898 break;
10899 case PyUnicode_4BYTE_KIND:
10900 COMPARE(Py_UCS1, Py_UCS4);
10901 break;
10902 default:
10903 assert(0);
10904 }
10905 break;
10906 }
10907 case PyUnicode_2BYTE_KIND:
10908 {
10909 switch(kind2) {
10910 case PyUnicode_1BYTE_KIND:
10911 COMPARE(Py_UCS2, Py_UCS1);
10912 break;
10913 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010914 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010915 COMPARE(Py_UCS2, Py_UCS2);
10916 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010917 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010918 case PyUnicode_4BYTE_KIND:
10919 COMPARE(Py_UCS2, Py_UCS4);
10920 break;
10921 default:
10922 assert(0);
10923 }
10924 break;
10925 }
10926 case PyUnicode_4BYTE_KIND:
10927 {
10928 switch(kind2) {
10929 case PyUnicode_1BYTE_KIND:
10930 COMPARE(Py_UCS4, Py_UCS1);
10931 break;
10932 case PyUnicode_2BYTE_KIND:
10933 COMPARE(Py_UCS4, Py_UCS2);
10934 break;
10935 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010936 {
10937#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10938 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10939 /* normalize result of wmemcmp() into the range [-1; 1] */
10940 if (cmp < 0)
10941 return -1;
10942 if (cmp > 0)
10943 return 1;
10944#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010945 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010946#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010947 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010948 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010949 default:
10950 assert(0);
10951 }
10952 break;
10953 }
10954 default:
10955 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010956 }
10957
Victor Stinner770e19e2012-10-04 22:59:45 +020010958 if (len1 == len2)
10959 return 0;
10960 if (len1 < len2)
10961 return -1;
10962 else
10963 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010964
10965#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010966}
10967
Benjamin Peterson621b4302016-09-09 13:54:34 -070010968static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010969unicode_compare_eq(PyObject *str1, PyObject *str2)
10970{
10971 int kind;
10972 void *data1, *data2;
10973 Py_ssize_t len;
10974 int cmp;
10975
Victor Stinnere5567ad2012-10-23 02:48:49 +020010976 len = PyUnicode_GET_LENGTH(str1);
10977 if (PyUnicode_GET_LENGTH(str2) != len)
10978 return 0;
10979 kind = PyUnicode_KIND(str1);
10980 if (PyUnicode_KIND(str2) != kind)
10981 return 0;
10982 data1 = PyUnicode_DATA(str1);
10983 data2 = PyUnicode_DATA(str2);
10984
10985 cmp = memcmp(data1, data2, len * kind);
10986 return (cmp == 0);
10987}
10988
10989
Alexander Belopolsky40018472011-02-26 01:02:56 +000010990int
10991PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10994 if (PyUnicode_READY(left) == -1 ||
10995 PyUnicode_READY(right) == -1)
10996 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010997
10998 /* a string is equal to itself */
10999 if (left == right)
11000 return 0;
11001
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011002 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011004 PyErr_Format(PyExc_TypeError,
11005 "Can't compare %.100s and %.100s",
11006 left->ob_type->tp_name,
11007 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 return -1;
11009}
11010
Martin v. Löwis5b222132007-06-10 09:51:05 +000011011int
11012PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_ssize_t i;
11015 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011017 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018
Victor Stinner910337b2011-10-03 03:20:16 +020011019 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011020 if (!PyUnicode_IS_READY(uni)) {
11021 const wchar_t *ws = _PyUnicode_WSTR(uni);
11022 /* Compare Unicode string and source character set string */
11023 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11024 if (chr != ustr[i])
11025 return (chr < ustr[i]) ? -1 : 1;
11026 }
11027 /* This check keeps Python strings that end in '\0' from comparing equal
11028 to C strings identical up to that point. */
11029 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11030 return 1; /* uni is longer */
11031 if (ustr[i])
11032 return -1; /* str is longer */
11033 return 0;
11034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011036 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011037 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011038 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011039 size_t len, len2 = strlen(str);
11040 int cmp;
11041
11042 len = Py_MIN(len1, len2);
11043 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011044 if (cmp != 0) {
11045 if (cmp < 0)
11046 return -1;
11047 else
11048 return 1;
11049 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011050 if (len1 > len2)
11051 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011052 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011053 return -1; /* str is longer */
11054 return 0;
11055 }
11056 else {
11057 void *data = PyUnicode_DATA(uni);
11058 /* Compare Unicode string and source character set string */
11059 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011060 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011061 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11062 /* This check keeps Python strings that end in '\0' from comparing equal
11063 to C strings identical up to that point. */
11064 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11065 return 1; /* uni is longer */
11066 if (str[i])
11067 return -1; /* str is longer */
11068 return 0;
11069 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011070}
11071
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011072static int
11073non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11074{
11075 size_t i, len;
11076 const wchar_t *p;
11077 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11078 if (strlen(str) != len)
11079 return 0;
11080 p = _PyUnicode_WSTR(unicode);
11081 assert(p);
11082 for (i = 0; i < len; i++) {
11083 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011084 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011085 return 0;
11086 }
11087 return 1;
11088}
11089
11090int
11091_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11092{
11093 size_t len;
11094 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011095 assert(str);
11096#ifndef NDEBUG
11097 for (const char *p = str; *p; p++) {
11098 assert((unsigned char)*p < 128);
11099 }
11100#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011101 if (PyUnicode_READY(unicode) == -1) {
11102 /* Memory error or bad data */
11103 PyErr_Clear();
11104 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11105 }
11106 if (!PyUnicode_IS_ASCII(unicode))
11107 return 0;
11108 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11109 return strlen(str) == len &&
11110 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11111}
11112
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011113int
11114_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11115{
11116 PyObject *right_uni;
11117 Py_hash_t hash;
11118
11119 assert(_PyUnicode_CHECK(left));
11120 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011121#ifndef NDEBUG
11122 for (const char *p = right->string; *p; p++) {
11123 assert((unsigned char)*p < 128);
11124 }
11125#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011126
11127 if (PyUnicode_READY(left) == -1) {
11128 /* memory error or bad data */
11129 PyErr_Clear();
11130 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11131 }
11132
11133 if (!PyUnicode_IS_ASCII(left))
11134 return 0;
11135
11136 right_uni = _PyUnicode_FromId(right); /* borrowed */
11137 if (right_uni == NULL) {
11138 /* memory error or bad data */
11139 PyErr_Clear();
11140 return _PyUnicode_EqualToASCIIString(left, right->string);
11141 }
11142
11143 if (left == right_uni)
11144 return 1;
11145
11146 if (PyUnicode_CHECK_INTERNED(left))
11147 return 0;
11148
11149 assert(_PyUnicode_HASH(right_uni) != 1);
11150 hash = _PyUnicode_HASH(left);
11151 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11152 return 0;
11153
11154 return unicode_compare_eq(left, right_uni);
11155}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011156
Benjamin Peterson29060642009-01-31 22:14:21 +000011157#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011158 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011159
Alexander Belopolsky40018472011-02-26 01:02:56 +000011160PyObject *
11161PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011162{
11163 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011164 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011165
Victor Stinnere5567ad2012-10-23 02:48:49 +020011166 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11167 Py_RETURN_NOTIMPLEMENTED;
11168
11169 if (PyUnicode_READY(left) == -1 ||
11170 PyUnicode_READY(right) == -1)
11171 return NULL;
11172
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011173 if (left == right) {
11174 switch (op) {
11175 case Py_EQ:
11176 case Py_LE:
11177 case Py_GE:
11178 /* a string is equal to itself */
11179 v = Py_True;
11180 break;
11181 case Py_NE:
11182 case Py_LT:
11183 case Py_GT:
11184 v = Py_False;
11185 break;
11186 default:
11187 PyErr_BadArgument();
11188 return NULL;
11189 }
11190 }
11191 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011192 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011193 result ^= (op == Py_NE);
11194 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011195 }
11196 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011197 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011198
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011199 /* Convert the return value to a Boolean */
11200 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011201 case Py_LE:
11202 v = TEST_COND(result <= 0);
11203 break;
11204 case Py_GE:
11205 v = TEST_COND(result >= 0);
11206 break;
11207 case Py_LT:
11208 v = TEST_COND(result == -1);
11209 break;
11210 case Py_GT:
11211 v = TEST_COND(result == 1);
11212 break;
11213 default:
11214 PyErr_BadArgument();
11215 return NULL;
11216 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011217 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011218 Py_INCREF(v);
11219 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011220}
11221
Alexander Belopolsky40018472011-02-26 01:02:56 +000011222int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011223_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11224{
11225 return unicode_eq(aa, bb);
11226}
11227
11228int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011230{
Victor Stinner77282cb2013-04-14 19:22:47 +020011231 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 void *buf1, *buf2;
11233 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011234 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011235
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011238 "'in <string>' requires string as left operand, not %.100s",
11239 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011240 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011241 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011243 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 if (ensure_unicode(str) < 0)
11245 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011248 kind2 = PyUnicode_KIND(substr);
11249 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011250 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011252 len2 = PyUnicode_GET_LENGTH(substr);
11253 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011254 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011255 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011256 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011257 if (len2 == 1) {
11258 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11259 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011260 return result;
11261 }
11262 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011263 buf2 = _PyUnicode_AsKind(substr, kind1);
11264 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011265 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267
Victor Stinner77282cb2013-04-14 19:22:47 +020011268 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 case PyUnicode_1BYTE_KIND:
11270 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11271 break;
11272 case PyUnicode_2BYTE_KIND:
11273 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11274 break;
11275 case PyUnicode_4BYTE_KIND:
11276 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11277 break;
11278 default:
11279 result = -1;
11280 assert(0);
11281 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011282
Victor Stinner77282cb2013-04-14 19:22:47 +020011283 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 PyMem_Free(buf2);
11285
Guido van Rossum403d68b2000-03-13 15:55:09 +000011286 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011287}
11288
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289/* Concat to string or Unicode object giving a new Unicode object. */
11290
Alexander Belopolsky40018472011-02-26 01:02:56 +000011291PyObject *
11292PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011295 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011298 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
11301 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 if (left == unicode_empty)
11303 return PyUnicode_FromObject(right);
11304 if (right == unicode_empty)
11305 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011307 left_len = PyUnicode_GET_LENGTH(left);
11308 right_len = PyUnicode_GET_LENGTH(right);
11309 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011310 PyErr_SetString(PyExc_OverflowError,
11311 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011312 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011313 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011315
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011316 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11317 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011318 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011321 result = PyUnicode_New(new_len, maxchar);
11322 if (result == NULL)
11323 return NULL;
11324 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11325 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11326 assert(_PyUnicode_CheckConsistency(result, 1));
11327 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328}
11329
Walter Dörwald1ab83302007-05-18 17:15:44 +000011330void
Victor Stinner23e56682011-10-03 03:54:37 +020011331PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011332{
Victor Stinner23e56682011-10-03 03:54:37 +020011333 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011334 Py_UCS4 maxchar, maxchar2;
11335 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011336
11337 if (p_left == NULL) {
11338 if (!PyErr_Occurred())
11339 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011340 return;
11341 }
Victor Stinner23e56682011-10-03 03:54:37 +020011342 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011343 if (right == NULL || left == NULL
11344 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011345 if (!PyErr_Occurred())
11346 PyErr_BadInternalCall();
11347 goto error;
11348 }
11349
Benjamin Petersonbac79492012-01-14 13:34:47 -050011350 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011351 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011352 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011353 goto error;
11354
Victor Stinner488fa492011-12-12 00:01:39 +010011355 /* Shortcuts */
11356 if (left == unicode_empty) {
11357 Py_DECREF(left);
11358 Py_INCREF(right);
11359 *p_left = right;
11360 return;
11361 }
11362 if (right == unicode_empty)
11363 return;
11364
11365 left_len = PyUnicode_GET_LENGTH(left);
11366 right_len = PyUnicode_GET_LENGTH(right);
11367 if (left_len > PY_SSIZE_T_MAX - right_len) {
11368 PyErr_SetString(PyExc_OverflowError,
11369 "strings are too large to concat");
11370 goto error;
11371 }
11372 new_len = left_len + right_len;
11373
11374 if (unicode_modifiable(left)
11375 && PyUnicode_CheckExact(right)
11376 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011377 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11378 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011379 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011380 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011381 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11382 {
11383 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011384 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011385 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011386
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011387 /* copy 'right' into the newly allocated area of 'left' */
11388 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011389 }
Victor Stinner488fa492011-12-12 00:01:39 +010011390 else {
11391 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11392 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011393 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011394
Victor Stinner488fa492011-12-12 00:01:39 +010011395 /* Concat the two Unicode strings */
11396 res = PyUnicode_New(new_len, maxchar);
11397 if (res == NULL)
11398 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011399 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11400 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011401 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011402 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011403 }
11404 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011405 return;
11406
11407error:
Victor Stinner488fa492011-12-12 00:01:39 +010011408 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011409}
11410
11411void
11412PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11413{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011414 PyUnicode_Append(pleft, right);
11415 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011416}
11417
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011418/*
11419Wraps stringlib_parse_args_finds() and additionally ensures that the
11420first argument is a unicode object.
11421*/
11422
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011423static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011424parse_args_finds_unicode(const char * function_name, PyObject *args,
11425 PyObject **substring,
11426 Py_ssize_t *start, Py_ssize_t *end)
11427{
11428 if(stringlib_parse_args_finds(function_name, args, substring,
11429 start, end)) {
11430 if (ensure_unicode(*substring) < 0)
11431 return 0;
11432 return 1;
11433 }
11434 return 0;
11435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011440Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011441string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011445unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011447 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011448 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011449 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011451 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 void *buf1, *buf2;
11453 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011455 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 kind1 = PyUnicode_KIND(self);
11459 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011461 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 len1 = PyUnicode_GET_LENGTH(self);
11464 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011466 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011467 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011468
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011469 buf1 = PyUnicode_DATA(self);
11470 buf2 = PyUnicode_DATA(substring);
11471 if (kind2 != kind1) {
11472 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011473 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011474 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011475 }
11476 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 case PyUnicode_1BYTE_KIND:
11478 iresult = ucs1lib_count(
11479 ((Py_UCS1*)buf1) + start, end - start,
11480 buf2, len2, PY_SSIZE_T_MAX
11481 );
11482 break;
11483 case PyUnicode_2BYTE_KIND:
11484 iresult = ucs2lib_count(
11485 ((Py_UCS2*)buf1) + start, end - start,
11486 buf2, len2, PY_SSIZE_T_MAX
11487 );
11488 break;
11489 case PyUnicode_4BYTE_KIND:
11490 iresult = ucs4lib_count(
11491 ((Py_UCS4*)buf1) + start, end - start,
11492 buf2, len2, PY_SSIZE_T_MAX
11493 );
11494 break;
11495 default:
11496 assert(0); iresult = 0;
11497 }
11498
11499 result = PyLong_FromSsize_t(iresult);
11500
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011501 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 return result;
11505}
11506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011507PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011508 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011510Encode S using the codec registered for encoding. Default encoding\n\
11511is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011512handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011513a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11514'xmlcharrefreplace' as well as any other name registered with\n\
11515codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011518unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011520 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 char *encoding = NULL;
11522 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011523
Benjamin Peterson308d6372009-09-18 21:42:35 +000011524 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11525 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011527 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011528}
11529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011530PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011531 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532\n\
11533Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011534If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535
11536static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011537unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011539 Py_ssize_t i, j, line_pos, src_len, incr;
11540 Py_UCS4 ch;
11541 PyObject *u;
11542 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011543 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011545 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011546 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Ezio Melotti745d54d2013-11-16 19:10:57 +020011548 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11549 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
Antoine Pitrou22425222011-10-04 19:10:51 +020011552 if (PyUnicode_READY(self) == -1)
11553 return NULL;
11554
Thomas Wouters7e474022000-07-16 12:04:32 +000011555 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 src_len = PyUnicode_GET_LENGTH(self);
11557 i = j = line_pos = 0;
11558 kind = PyUnicode_KIND(self);
11559 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011560 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 for (; i < src_len; i++) {
11562 ch = PyUnicode_READ(kind, src_data, i);
11563 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011564 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011566 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 goto overflow;
11569 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011571 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011575 goto overflow;
11576 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 if (ch == '\n' || ch == '\r')
11579 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011581 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011582 if (!found)
11583 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011584
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587 if (!u)
11588 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 for (; i < src_len; i++) {
11594 ch = PyUnicode_READ(kind, src_data, i);
11595 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 incr = tabsize - (line_pos % tabsize);
11598 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011599 FILL(kind, dest_data, ' ', j, incr);
11600 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011602 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 line_pos++;
11605 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011606 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011607 if (ch == '\n' || ch == '\r')
11608 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011610 }
11611 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011612 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011613
Antoine Pitroue71d5742011-10-04 15:55:09 +020011614 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011615 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617}
11618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011619PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621\n\
11622Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011623such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624arguments start and end are interpreted as in slice notation.\n\
11625\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
11628static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011631 /* initialize variables to prevent gcc warning */
11632 PyObject *substring = NULL;
11633 Py_ssize_t start = 0;
11634 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011635 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011637 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011640 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 if (result == -2)
11646 return NULL;
11647
Christian Heimes217cfd12007-12-02 14:31:20 +000011648 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649}
11650
11651static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011652unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011654 void *data;
11655 enum PyUnicode_Kind kind;
11656 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011657
11658 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11659 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011661 }
11662 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11663 PyErr_SetString(PyExc_IndexError, "string index out of range");
11664 return NULL;
11665 }
11666 kind = PyUnicode_KIND(self);
11667 data = PyUnicode_DATA(self);
11668 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011669 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670}
11671
Guido van Rossumc2504932007-09-18 19:42:40 +000011672/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011673 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011674static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011675unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676{
Guido van Rossumc2504932007-09-18 19:42:40 +000011677 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011678 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011679
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011680#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011681 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011682#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 if (_PyUnicode_HASH(self) != -1)
11684 return _PyUnicode_HASH(self);
11685 if (PyUnicode_READY(self) == -1)
11686 return -1;
11687 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011688 /*
11689 We make the hash of the empty string be 0, rather than using
11690 (prefix ^ suffix), since this slightly obfuscates the hash secret
11691 */
11692 if (len == 0) {
11693 _PyUnicode_HASH(self) = 0;
11694 return 0;
11695 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011696 x = _Py_HashBytes(PyUnicode_DATA(self),
11697 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011699 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700}
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704\n\
Mariatta577fc042017-04-09 15:17:06 -070011705Return the lowest index in S where substring sub is found, \n\
11706such that sub is contained within S[start:end]. Optional\n\
11707arguments start and end are interpreted as in slice notation.\n\
11708\n\
11709Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710
11711static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011714 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011715 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011716 PyObject *substring = NULL;
11717 Py_ssize_t start = 0;
11718 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011720 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011723 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011726 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (result == -2)
11729 return NULL;
11730
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 if (result < 0) {
11732 PyErr_SetString(PyExc_ValueError, "substring not found");
11733 return NULL;
11734 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011735
Christian Heimes217cfd12007-12-02 14:31:20 +000011736 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737}
11738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011739PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011742Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011743at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
11745static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011746unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 Py_ssize_t i, length;
11749 int kind;
11750 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 int cased;
11752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (PyUnicode_READY(self) == -1)
11754 return NULL;
11755 length = PyUnicode_GET_LENGTH(self);
11756 kind = PyUnicode_KIND(self);
11757 data = PyUnicode_DATA(self);
11758
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (length == 1)
11761 return PyBool_FromLong(
11762 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011764 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011767
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 for (i = 0; i < length; i++) {
11770 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011771
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11773 return PyBool_FromLong(0);
11774 else if (!cased && Py_UNICODE_ISLOWER(ch))
11775 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778}
11779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011783Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011784at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
11786static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011787unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t i, length;
11790 int kind;
11791 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 int cased;
11793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (PyUnicode_READY(self) == -1)
11795 return NULL;
11796 length = PyUnicode_GET_LENGTH(self);
11797 kind = PyUnicode_KIND(self);
11798 data = PyUnicode_DATA(self);
11799
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (length == 1)
11802 return PyBool_FromLong(
11803 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011805 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011808
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 for (i = 0; i < length; i++) {
11811 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011812
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11814 return PyBool_FromLong(0);
11815 else if (!cased && Py_UNICODE_ISUPPER(ch))
11816 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011818 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819}
11820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011821PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011824Return True if S is a titlecased string and there is at least one\n\
11825character in S, i.e. upper- and titlecase characters may only\n\
11826follow uncased characters and lowercase characters only cased ones.\n\
11827Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
11829static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011830unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 Py_ssize_t i, length;
11833 int kind;
11834 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 int cased, previous_is_cased;
11836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (PyUnicode_READY(self) == -1)
11838 return NULL;
11839 length = PyUnicode_GET_LENGTH(self);
11840 kind = PyUnicode_KIND(self);
11841 data = PyUnicode_DATA(self);
11842
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (length == 1) {
11845 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11846 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11847 (Py_UNICODE_ISUPPER(ch) != 0));
11848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011850 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011853
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 cased = 0;
11855 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 for (i = 0; i < length; i++) {
11857 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011858
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11860 if (previous_is_cased)
11861 return PyBool_FromLong(0);
11862 previous_is_cased = 1;
11863 cased = 1;
11864 }
11865 else if (Py_UNICODE_ISLOWER(ch)) {
11866 if (!previous_is_cased)
11867 return PyBool_FromLong(0);
11868 previous_is_cased = 1;
11869 cased = 1;
11870 }
11871 else
11872 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011874 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875}
11876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011877PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011880Return True if all characters in S are whitespace\n\
11881and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882
11883static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011884unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 Py_ssize_t i, length;
11887 int kind;
11888 void *data;
11889
11890 if (PyUnicode_READY(self) == -1)
11891 return NULL;
11892 length = PyUnicode_GET_LENGTH(self);
11893 kind = PyUnicode_KIND(self);
11894 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (length == 1)
11898 return PyBool_FromLong(
11899 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011901 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 for (i = 0; i < length; i++) {
11906 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011907 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011910 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911}
11912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011913PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011915\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011916Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011917and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011918
11919static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011920unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 Py_ssize_t i, length;
11923 int kind;
11924 void *data;
11925
11926 if (PyUnicode_READY(self) == -1)
11927 return NULL;
11928 length = PyUnicode_GET_LENGTH(self);
11929 kind = PyUnicode_KIND(self);
11930 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011931
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 1)
11934 return PyBool_FromLong(
11935 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011936
11937 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 for (i = 0; i < length; i++) {
11942 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011945 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946}
11947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011950\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011951Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011952and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011953
11954static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011955unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 int kind;
11958 void *data;
11959 Py_ssize_t len, i;
11960
11961 if (PyUnicode_READY(self) == -1)
11962 return NULL;
11963
11964 kind = PyUnicode_KIND(self);
11965 data = PyUnicode_DATA(self);
11966 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011968 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 if (len == 1) {
11970 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11971 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11972 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973
11974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 for (i = 0; i < len; i++) {
11979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011980 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011982 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011983 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011984}
11985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011986PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011989Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011990False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
11992static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011993unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 Py_ssize_t i, length;
11996 int kind;
11997 void *data;
11998
11999 if (PyUnicode_READY(self) == -1)
12000 return NULL;
12001 length = PyUnicode_GET_LENGTH(self);
12002 kind = PyUnicode_KIND(self);
12003 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (length == 1)
12007 return PyBool_FromLong(
12008 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012010 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 for (i = 0; i < length; i++) {
12015 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019}
12020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012021PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000012024Return True if all characters in S are digits\n\
12025and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
12027static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012028unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 Py_ssize_t i, length;
12031 int kind;
12032 void *data;
12033
12034 if (PyUnicode_READY(self) == -1)
12035 return NULL;
12036 length = PyUnicode_GET_LENGTH(self);
12037 kind = PyUnicode_KIND(self);
12038 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (length == 1) {
12042 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12043 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012046 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 for (i = 0; i < length; i++) {
12051 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012054 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055}
12056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012057PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000012060Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012061False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
12063static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012064unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 Py_ssize_t i, length;
12067 int kind;
12068 void *data;
12069
12070 if (PyUnicode_READY(self) == -1)
12071 return NULL;
12072 length = PyUnicode_GET_LENGTH(self);
12073 kind = PyUnicode_KIND(self);
12074 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 if (length == 1)
12078 return PyBool_FromLong(
12079 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012081 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 for (i = 0; i < length; i++) {
12086 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012089 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090}
12091
Martin v. Löwis47383402007-08-15 07:32:56 +000012092int
12093PyUnicode_IsIdentifier(PyObject *self)
12094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 int kind;
12096 void *data;
12097 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012098 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (PyUnicode_READY(self) == -1) {
12101 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 }
12104
12105 /* Special case for empty strings */
12106 if (PyUnicode_GET_LENGTH(self) == 0)
12107 return 0;
12108 kind = PyUnicode_KIND(self);
12109 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012110
12111 /* PEP 3131 says that the first character must be in
12112 XID_Start and subsequent characters in XID_Continue,
12113 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012114 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012115 letters, digits, underscore). However, given the current
12116 definition of XID_Start and XID_Continue, it is sufficient
12117 to check just for these, except that _ must be allowed
12118 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012120 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012121 return 0;
12122
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012123 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012126 return 1;
12127}
12128
12129PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012131\n\
12132Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012133to the language definition.\n\
12134\n\
12135Use keyword.iskeyword() to test for reserved identifiers\n\
12136such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012137
12138static PyObject*
12139unicode_isidentifier(PyObject *self)
12140{
12141 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12142}
12143
Georg Brandl559e5d72008-06-11 18:37:52 +000012144PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012146\n\
12147Return True if all characters in S are considered\n\
12148printable in repr() or S is empty, False otherwise.");
12149
12150static PyObject*
12151unicode_isprintable(PyObject *self)
12152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 Py_ssize_t i, length;
12154 int kind;
12155 void *data;
12156
12157 if (PyUnicode_READY(self) == -1)
12158 return NULL;
12159 length = PyUnicode_GET_LENGTH(self);
12160 kind = PyUnicode_KIND(self);
12161 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012162
12163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (length == 1)
12165 return PyBool_FromLong(
12166 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 for (i = 0; i < length; i++) {
12169 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012170 Py_RETURN_FALSE;
12171 }
12172 }
12173 Py_RETURN_TRUE;
12174}
12175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012176PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012177 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178\n\
12179Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012180iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
12182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012183unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012185 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186}
12187
Martin v. Löwis18e16552006-02-15 17:27:45 +000012188static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012189unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (PyUnicode_READY(self) == -1)
12192 return -1;
12193 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012196PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012197 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012199Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012200done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
12202static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012203unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012205 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 Py_UCS4 fillchar = ' ';
12207
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012208 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209 return NULL;
12210
Benjamin Petersonbac79492012-01-14 13:34:47 -050012211 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
Victor Stinnerc4b49542011-12-11 22:44:26 +010012214 if (PyUnicode_GET_LENGTH(self) >= width)
12215 return unicode_result_unchanged(self);
12216
12217 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218}
12219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012220PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012223Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
12225static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012226unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012228 if (PyUnicode_READY(self) == -1)
12229 return NULL;
12230 if (PyUnicode_IS_ASCII(self))
12231 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012232 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char * const stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string past its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
12243
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012244/* externally visible for str.strip(unicode) */
12245PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012246_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 void *data;
12249 int kind;
12250 Py_ssize_t i, j, len;
12251 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012252 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12255 return NULL;
12256
12257 kind = PyUnicode_KIND(self);
12258 data = PyUnicode_DATA(self);
12259 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012260 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12262 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012263 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264
Benjamin Peterson14339b62009-01-31 16:36:08 +000012265 i = 0;
12266 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012267 while (i < len) {
12268 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12269 if (!BLOOM(sepmask, ch))
12270 break;
12271 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12272 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 i++;
12274 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012275 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012276
Benjamin Peterson14339b62009-01-31 16:36:08 +000012277 j = len;
12278 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012279 j--;
12280 while (j >= i) {
12281 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12282 if (!BLOOM(sepmask, ch))
12283 break;
12284 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12285 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012287 }
12288
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012290 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012291
Victor Stinner7931d9a2011-11-04 00:22:48 +010012292 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293}
12294
12295PyObject*
12296PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12297{
12298 unsigned char *data;
12299 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012300 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301
Victor Stinnerde636f32011-10-01 03:55:54 +020012302 if (PyUnicode_READY(self) == -1)
12303 return NULL;
12304
Victor Stinner684d5fd2012-05-03 02:32:34 +020012305 length = PyUnicode_GET_LENGTH(self);
12306 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012307
Victor Stinner684d5fd2012-05-03 02:32:34 +020012308 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012309 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310
Victor Stinnerde636f32011-10-01 03:55:54 +020012311 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012312 PyErr_SetString(PyExc_IndexError, "string index out of range");
12313 return NULL;
12314 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012315 if (start >= length || end < start)
12316 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012317
Victor Stinner684d5fd2012-05-03 02:32:34 +020012318 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012319 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012320 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012321 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012322 }
12323 else {
12324 kind = PyUnicode_KIND(self);
12325 data = PyUnicode_1BYTE_DATA(self);
12326 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012327 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012328 length);
12329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331
12332static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012333do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 Py_ssize_t len, i, j;
12336
12337 if (PyUnicode_READY(self) == -1)
12338 return NULL;
12339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012341
Victor Stinnercc7af722013-04-09 22:39:24 +020012342 if (PyUnicode_IS_ASCII(self)) {
12343 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12344
12345 i = 0;
12346 if (striptype != RIGHTSTRIP) {
12347 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012348 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012349 if (!_Py_ascii_whitespace[ch])
12350 break;
12351 i++;
12352 }
12353 }
12354
12355 j = len;
12356 if (striptype != LEFTSTRIP) {
12357 j--;
12358 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012359 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012360 if (!_Py_ascii_whitespace[ch])
12361 break;
12362 j--;
12363 }
12364 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012365 }
12366 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012367 else {
12368 int kind = PyUnicode_KIND(self);
12369 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012370
Victor Stinnercc7af722013-04-09 22:39:24 +020012371 i = 0;
12372 if (striptype != RIGHTSTRIP) {
12373 while (i < len) {
12374 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12375 if (!Py_UNICODE_ISSPACE(ch))
12376 break;
12377 i++;
12378 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012379 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012380
12381 j = len;
12382 if (striptype != LEFTSTRIP) {
12383 j--;
12384 while (j >= i) {
12385 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12386 if (!Py_UNICODE_ISSPACE(ch))
12387 break;
12388 j--;
12389 }
12390 j++;
12391 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012393
Victor Stinner7931d9a2011-11-04 00:22:48 +010012394 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395}
12396
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012397
12398static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012399do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402
Serhiy Storchakac6792272013-10-19 21:03:34 +030012403 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012405
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 if (sep != NULL && sep != Py_None) {
12407 if (PyUnicode_Check(sep))
12408 return _PyUnicode_XStrip(self, striptype, sep);
12409 else {
12410 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 "%s arg must be None or str",
12412 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 return NULL;
12414 }
12415 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012416
Benjamin Peterson14339b62009-01-31 16:36:08 +000012417 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012418}
12419
12420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012421PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012423\n\
12424Return a copy of the string S with leading and trailing\n\
12425whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012426If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012427
12428static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012429unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012430{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012431 if (PyTuple_GET_SIZE(args) == 0)
12432 return do_strip(self, BOTHSTRIP); /* Common case */
12433 else
12434 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435}
12436
12437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012438PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440\n\
12441Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012442If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443
12444static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012445unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012446{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 if (PyTuple_GET_SIZE(args) == 0)
12448 return do_strip(self, LEFTSTRIP); /* Common case */
12449 else
12450 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012451}
12452
12453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012454PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456\n\
12457Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012458If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459
12460static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012461unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012463 if (PyTuple_GET_SIZE(args) == 0)
12464 return do_strip(self, RIGHTSTRIP); /* Common case */
12465 else
12466 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467}
12468
12469
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012471unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012473 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475
Serhiy Storchaka05997252013-01-26 12:14:02 +020012476 if (len < 1)
12477 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478
Victor Stinnerc4b49542011-12-11 22:44:26 +010012479 /* no repeat, return original string */
12480 if (len == 1)
12481 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012482
Benjamin Petersonbac79492012-01-14 13:34:47 -050012483 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 return NULL;
12485
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012486 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012487 PyErr_SetString(PyExc_OverflowError,
12488 "repeated string is too long");
12489 return NULL;
12490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012492
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012493 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494 if (!u)
12495 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012496 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 if (PyUnicode_GET_LENGTH(str) == 1) {
12499 const int kind = PyUnicode_KIND(str);
12500 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012501 if (kind == PyUnicode_1BYTE_KIND) {
12502 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012503 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012504 }
12505 else if (kind == PyUnicode_2BYTE_KIND) {
12506 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012507 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012508 ucs2[n] = fill_char;
12509 } else {
12510 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12511 assert(kind == PyUnicode_4BYTE_KIND);
12512 for (n = 0; n < len; ++n)
12513 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 }
12516 else {
12517 /* number of characters copied this far */
12518 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012519 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012521 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012525 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012526 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528 }
12529
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012530 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012531 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532}
12533
Alexander Belopolsky40018472011-02-26 01:02:56 +000012534PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012535PyUnicode_Replace(PyObject *str,
12536 PyObject *substr,
12537 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012538 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012540 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12541 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012542 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012543 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544}
12545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012546PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012547 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548\n\
12549Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012550old replaced by new. If the optional argument count is\n\
12551given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552
12553static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 PyObject *str1;
12557 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012558 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012560 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012562 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012563 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012564 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565}
12566
Alexander Belopolsky40018472011-02-26 01:02:56 +000012567static PyObject *
12568unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012570 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 Py_ssize_t isize;
12572 Py_ssize_t osize, squote, dquote, i, o;
12573 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012574 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012578 return NULL;
12579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 isize = PyUnicode_GET_LENGTH(unicode);
12581 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 /* Compute length of output, quote characters, and
12584 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012585 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 max = 127;
12587 squote = dquote = 0;
12588 ikind = PyUnicode_KIND(unicode);
12589 for (i = 0; i < isize; i++) {
12590 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012591 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012593 case '\'': squote++; break;
12594 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012596 incr = 2;
12597 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 default:
12599 /* Fast-path ASCII */
12600 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012601 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012603 ;
12604 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012607 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012609 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012611 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012613 if (osize > PY_SSIZE_T_MAX - incr) {
12614 PyErr_SetString(PyExc_OverflowError,
12615 "string is too long to generate repr");
12616 return NULL;
12617 }
12618 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 }
12620
12621 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012622 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012624 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 if (dquote)
12626 /* Both squote and dquote present. Use squote,
12627 and escape them */
12628 osize += squote;
12629 else
12630 quote = '"';
12631 }
Victor Stinner55c08782013-04-14 18:45:39 +020012632 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633
12634 repr = PyUnicode_New(osize, max);
12635 if (repr == NULL)
12636 return NULL;
12637 okind = PyUnicode_KIND(repr);
12638 odata = PyUnicode_DATA(repr);
12639
12640 PyUnicode_WRITE(okind, odata, 0, quote);
12641 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012642 if (unchanged) {
12643 _PyUnicode_FastCopyCharacters(repr, 1,
12644 unicode, 0,
12645 isize);
12646 }
12647 else {
12648 for (i = 0, o = 1; i < isize; i++) {
12649 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650
Victor Stinner55c08782013-04-14 18:45:39 +020012651 /* Escape quotes and backslashes */
12652 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012653 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012655 continue;
12656 }
12657
12658 /* Map special whitespace to '\t', \n', '\r' */
12659 if (ch == '\t') {
12660 PyUnicode_WRITE(okind, odata, o++, '\\');
12661 PyUnicode_WRITE(okind, odata, o++, 't');
12662 }
12663 else if (ch == '\n') {
12664 PyUnicode_WRITE(okind, odata, o++, '\\');
12665 PyUnicode_WRITE(okind, odata, o++, 'n');
12666 }
12667 else if (ch == '\r') {
12668 PyUnicode_WRITE(okind, odata, o++, '\\');
12669 PyUnicode_WRITE(okind, odata, o++, 'r');
12670 }
12671
12672 /* Map non-printable US ASCII to '\xhh' */
12673 else if (ch < ' ' || ch == 0x7F) {
12674 PyUnicode_WRITE(okind, odata, o++, '\\');
12675 PyUnicode_WRITE(okind, odata, o++, 'x');
12676 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12677 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12678 }
12679
12680 /* Copy ASCII characters as-is */
12681 else if (ch < 0x7F) {
12682 PyUnicode_WRITE(okind, odata, o++, ch);
12683 }
12684
12685 /* Non-ASCII characters */
12686 else {
12687 /* Map Unicode whitespace and control characters
12688 (categories Z* and C* except ASCII space)
12689 */
12690 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12691 PyUnicode_WRITE(okind, odata, o++, '\\');
12692 /* Map 8-bit characters to '\xhh' */
12693 if (ch <= 0xff) {
12694 PyUnicode_WRITE(okind, odata, o++, 'x');
12695 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12696 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12697 }
12698 /* Map 16-bit characters to '\uxxxx' */
12699 else if (ch <= 0xffff) {
12700 PyUnicode_WRITE(okind, odata, o++, 'u');
12701 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12702 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12703 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12704 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12705 }
12706 /* Map 21-bit characters to '\U00xxxxxx' */
12707 else {
12708 PyUnicode_WRITE(okind, odata, o++, 'U');
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12713 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12714 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12715 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12716 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12717 }
12718 }
12719 /* Copy characters as-is */
12720 else {
12721 PyUnicode_WRITE(okind, odata, o++, ch);
12722 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012723 }
12724 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012727 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012728 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729}
12730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012731PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733\n\
12734Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012735such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736arguments start and end are interpreted as in slice notation.\n\
12737\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012738Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739
12740static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012743 /* initialize variables to prevent gcc warning */
12744 PyObject *substring = NULL;
12745 Py_ssize_t start = 0;
12746 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012747 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012749 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012752 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012755 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 if (result == -2)
12758 return NULL;
12759
Christian Heimes217cfd12007-12-02 14:31:20 +000012760 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761}
12762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012763PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765\n\
Mariatta577fc042017-04-09 15:17:06 -070012766Return the highest index in S where substring sub is found,\n\
12767such that sub is contained within S[start:end]. Optional\n\
12768arguments start and end are interpreted as in slice notation.\n\
12769\n\
12770Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771
12772static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012775 /* initialize variables to prevent gcc warning */
12776 PyObject *substring = NULL;
12777 Py_ssize_t start = 0;
12778 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012779 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012781 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012784 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012787 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 if (result == -2)
12790 return NULL;
12791
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792 if (result < 0) {
12793 PyErr_SetString(PyExc_ValueError, "substring not found");
12794 return NULL;
12795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796
Christian Heimes217cfd12007-12-02 14:31:20 +000012797 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798}
12799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012800PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012803Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012804done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
12806static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012807unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012809 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 Py_UCS4 fillchar = ' ';
12811
Victor Stinnere9a29352011-10-01 02:14:59 +020012812 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012814
Benjamin Petersonbac79492012-01-14 13:34:47 -050012815 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816 return NULL;
12817
Victor Stinnerc4b49542011-12-11 22:44:26 +010012818 if (PyUnicode_GET_LENGTH(self) >= width)
12819 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
Victor Stinnerc4b49542011-12-11 22:44:26 +010012821 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822}
12823
Alexander Belopolsky40018472011-02-26 01:02:56 +000012824PyObject *
12825PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012827 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012830 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831}
12832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012833PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835\n\
12836Return a list of the words in S, using sep as the\n\
12837delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012838splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012839whitespace string is a separator and empty strings are\n\
12840removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841
12842static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012843unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012845 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012847 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012849 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12850 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851 return NULL;
12852
12853 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012855
12856 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012857 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858
12859 PyErr_Format(PyExc_TypeError,
12860 "must be str or None, not %.100s",
12861 Py_TYPE(substring)->tp_name);
12862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863}
12864
Thomas Wouters477c8d52006-05-27 19:21:47 +000012865PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012866PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012868 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012869 int kind1, kind2;
12870 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012872
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012873 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875
Victor Stinner14f8f022011-10-05 20:58:25 +020012876 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 len1 = PyUnicode_GET_LENGTH(str_obj);
12879 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012880 if (kind1 < kind2 || len1 < len2) {
12881 _Py_INCREF_UNICODE_EMPTY();
12882 if (!unicode_empty)
12883 out = NULL;
12884 else {
12885 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12886 Py_DECREF(unicode_empty);
12887 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012888 return out;
12889 }
12890 buf1 = PyUnicode_DATA(str_obj);
12891 buf2 = PyUnicode_DATA(sep_obj);
12892 if (kind2 != kind1) {
12893 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12894 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012895 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012898 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012900 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12901 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12902 else
12903 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 break;
12905 case PyUnicode_2BYTE_KIND:
12906 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12907 break;
12908 case PyUnicode_4BYTE_KIND:
12909 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12910 break;
12911 default:
12912 assert(0);
12913 out = 0;
12914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012915
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012916 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012918
12919 return out;
12920}
12921
12922
12923PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012926 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012927 int kind1, kind2;
12928 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012934 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 len1 = PyUnicode_GET_LENGTH(str_obj);
12937 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012938 if (kind1 < kind2 || len1 < len2) {
12939 _Py_INCREF_UNICODE_EMPTY();
12940 if (!unicode_empty)
12941 out = NULL;
12942 else {
12943 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12944 Py_DECREF(unicode_empty);
12945 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012946 return out;
12947 }
12948 buf1 = PyUnicode_DATA(str_obj);
12949 buf2 = PyUnicode_DATA(sep_obj);
12950 if (kind2 != kind1) {
12951 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12952 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012953 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012956 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012958 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12959 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12960 else
12961 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 break;
12963 case PyUnicode_2BYTE_KIND:
12964 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 break;
12966 case PyUnicode_4BYTE_KIND:
12967 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968 break;
12969 default:
12970 assert(0);
12971 out = 0;
12972 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012974 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976
12977 return out;
12978}
12979
12980PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012982\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012983Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012985found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012986
12987static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012988unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989{
Victor Stinner9310abb2011-10-05 00:59:23 +020012990 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991}
12992
12993PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012994 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012996Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012997the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012998separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012999
13000static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013001unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002{
Victor Stinner9310abb2011-10-05 00:59:23 +020013003 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013004}
13005
Alexander Belopolsky40018472011-02-26 01:02:56 +000013006PyObject *
13007PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013008{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013009 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013010 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013011
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013012 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013013}
13014
13015PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013016 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013017\n\
13018Return a list of the words in S, using sep as the\n\
13019delimiter string, starting at the end of the string and\n\
13020working to the front. If maxsplit is given, at most maxsplit\n\
13021splits are done. If sep is not specified, any whitespace string\n\
13022is a separator.");
13023
13024static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013025unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013026{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013027 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013028 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013029 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013030
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013031 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13032 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013033 return NULL;
13034
13035 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000013036 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013037
13038 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020013039 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013040
13041 PyErr_Format(PyExc_TypeError,
13042 "must be str or None, not %.100s",
13043 Py_TYPE(substring)->tp_name);
13044 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013045}
13046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013047PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049\n\
13050Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013051Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013052is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053
13054static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013055unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013057 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013058 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013060 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13061 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062 return NULL;
13063
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013064 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065}
13066
13067static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013068PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013070 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071}
13072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013073PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075\n\
13076Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013077and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078
13079static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013080unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013082 if (PyUnicode_READY(self) == -1)
13083 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013084 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085}
13086
Larry Hastings61272b72014-01-07 12:41:53 -080013087/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013088
Larry Hastings31826802013-10-19 00:09:25 -070013089@staticmethod
13090str.maketrans as unicode_maketrans
13091
13092 x: object
13093
13094 y: unicode=NULL
13095
13096 z: unicode=NULL
13097
13098 /
13099
13100Return a translation table usable for str.translate().
13101
13102If there is only one argument, it must be a dictionary mapping Unicode
13103ordinals (integers) or characters to Unicode ordinals, strings or None.
13104Character keys will be then converted to ordinals.
13105If there are two arguments, they must be strings of equal length, and
13106in the resulting dictionary, each character in x will be mapped to the
13107character at the same position in y. If there is a third argument, it
13108must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013109[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013110
Larry Hastings31826802013-10-19 00:09:25 -070013111static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013112unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013113/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013114{
Georg Brandlceee0772007-11-27 23:48:05 +000013115 PyObject *new = NULL, *key, *value;
13116 Py_ssize_t i = 0;
13117 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013118
Georg Brandlceee0772007-11-27 23:48:05 +000013119 new = PyDict_New();
13120 if (!new)
13121 return NULL;
13122 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 int x_kind, y_kind, z_kind;
13124 void *x_data, *y_data, *z_data;
13125
Georg Brandlceee0772007-11-27 23:48:05 +000013126 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013127 if (!PyUnicode_Check(x)) {
13128 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13129 "be a string if there is a second argument");
13130 goto err;
13131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013133 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13134 "arguments must have equal length");
13135 goto err;
13136 }
13137 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 x_kind = PyUnicode_KIND(x);
13139 y_kind = PyUnicode_KIND(y);
13140 x_data = PyUnicode_DATA(x);
13141 y_data = PyUnicode_DATA(y);
13142 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13143 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013144 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013145 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013146 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013147 if (!value) {
13148 Py_DECREF(key);
13149 goto err;
13150 }
Georg Brandlceee0772007-11-27 23:48:05 +000013151 res = PyDict_SetItem(new, key, value);
13152 Py_DECREF(key);
13153 Py_DECREF(value);
13154 if (res < 0)
13155 goto err;
13156 }
13157 /* create entries for deleting chars in z */
13158 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 z_kind = PyUnicode_KIND(z);
13160 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013161 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013163 if (!key)
13164 goto err;
13165 res = PyDict_SetItem(new, key, Py_None);
13166 Py_DECREF(key);
13167 if (res < 0)
13168 goto err;
13169 }
13170 }
13171 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 int kind;
13173 void *data;
13174
Georg Brandlceee0772007-11-27 23:48:05 +000013175 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013176 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013177 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13178 "to maketrans it must be a dict");
13179 goto err;
13180 }
13181 /* copy entries into the new dict, converting string keys to int keys */
13182 while (PyDict_Next(x, &i, &key, &value)) {
13183 if (PyUnicode_Check(key)) {
13184 /* convert string keys to integer keys */
13185 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013186 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013187 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13188 "table must be of length 1");
13189 goto err;
13190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 kind = PyUnicode_KIND(key);
13192 data = PyUnicode_DATA(key);
13193 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013194 if (!newkey)
13195 goto err;
13196 res = PyDict_SetItem(new, newkey, value);
13197 Py_DECREF(newkey);
13198 if (res < 0)
13199 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013200 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013201 /* just keep integer keys */
13202 if (PyDict_SetItem(new, key, value) < 0)
13203 goto err;
13204 } else {
13205 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13206 "be strings or integers");
13207 goto err;
13208 }
13209 }
13210 }
13211 return new;
13212 err:
13213 Py_DECREF(new);
13214 return NULL;
13215}
13216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013217PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013220Return a copy of the string S in which each character has been mapped\n\
13221through the given translation table. The table must implement\n\
13222lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13223mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13224this operation raises LookupError, the character is left untouched.\n\
13225Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226
13227static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231}
13232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013233PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013236Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237
13238static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013239unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013241 if (PyUnicode_READY(self) == -1)
13242 return NULL;
13243 if (PyUnicode_IS_ASCII(self))
13244 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013245 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246}
13247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013248PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013251Pad a numeric string S with zeros on the left, to fill a field\n\
13252of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253
13254static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013255unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013257 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013258 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013259 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 int kind;
13261 void *data;
13262 Py_UCS4 chr;
13263
Martin v. Löwis18e16552006-02-15 17:27:45 +000013264 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265 return NULL;
13266
Benjamin Petersonbac79492012-01-14 13:34:47 -050013267 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269
Victor Stinnerc4b49542011-12-11 22:44:26 +010013270 if (PyUnicode_GET_LENGTH(self) >= width)
13271 return unicode_result_unchanged(self);
13272
13273 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
13275 u = pad(self, fill, 0, '0');
13276
Walter Dörwald068325e2002-04-15 13:36:47 +000013277 if (u == NULL)
13278 return NULL;
13279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 kind = PyUnicode_KIND(u);
13281 data = PyUnicode_DATA(u);
13282 chr = PyUnicode_READ(kind, data, fill);
13283
13284 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 PyUnicode_WRITE(kind, data, 0, chr);
13287 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288 }
13289
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013290 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013291 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013302PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013305Return True if S starts with the specified prefix, False otherwise.\n\
13306With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013307With optional end, stop comparing S at that position.\n\
13308prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
13310static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013311unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013314 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013315 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013316 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013317 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013318 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319
Jesus Ceaac451502011-04-20 17:09:23 +020013320 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013322 if (PyTuple_Check(subobj)) {
13323 Py_ssize_t i;
13324 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013325 substring = PyTuple_GET_ITEM(subobj, i);
13326 if (!PyUnicode_Check(substring)) {
13327 PyErr_Format(PyExc_TypeError,
13328 "tuple for startswith must only contain str, "
13329 "not %.100s",
13330 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013331 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013332 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013333 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013334 if (result == -1)
13335 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013336 if (result) {
13337 Py_RETURN_TRUE;
13338 }
13339 }
13340 /* nothing matched */
13341 Py_RETURN_FALSE;
13342 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013343 if (!PyUnicode_Check(subobj)) {
13344 PyErr_Format(PyExc_TypeError,
13345 "startswith first arg must be str or "
13346 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013347 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013348 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013349 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013350 if (result == -1)
13351 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013352 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353}
13354
13355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013356PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013359Return True if S ends with the specified suffix, False otherwise.\n\
13360With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013361With optional end, stop comparing S at that position.\n\
13362suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363
13364static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013365unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013368 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013369 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013370 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013371 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013372 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373
Jesus Ceaac451502011-04-20 17:09:23 +020013374 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013376 if (PyTuple_Check(subobj)) {
13377 Py_ssize_t i;
13378 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013379 substring = PyTuple_GET_ITEM(subobj, i);
13380 if (!PyUnicode_Check(substring)) {
13381 PyErr_Format(PyExc_TypeError,
13382 "tuple for endswith must only contain str, "
13383 "not %.100s",
13384 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013386 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013388 if (result == -1)
13389 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013390 if (result) {
13391 Py_RETURN_TRUE;
13392 }
13393 }
13394 Py_RETURN_FALSE;
13395 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 if (!PyUnicode_Check(subobj)) {
13397 PyErr_Format(PyExc_TypeError,
13398 "endswith first arg must be str or "
13399 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013401 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013403 if (result == -1)
13404 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013405 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406}
13407
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013408static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013409_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013410{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013411 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13412 writer->data = PyUnicode_DATA(writer->buffer);
13413
13414 if (!writer->readonly) {
13415 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013416 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013417 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013418 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013419 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13420 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13421 writer->kind = PyUnicode_WCHAR_KIND;
13422 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13423
Victor Stinner8f674cc2013-04-17 23:02:17 +020013424 /* Copy-on-write mode: set buffer size to 0 so
13425 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13426 * next write. */
13427 writer->size = 0;
13428 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013429}
13430
Victor Stinnerd3f08822012-05-29 12:57:52 +020013431void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013432_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013433{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013434 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013435
13436 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013437 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013438
13439 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13440 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13441 writer->kind = PyUnicode_WCHAR_KIND;
13442 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013443}
13444
Victor Stinnerd3f08822012-05-29 12:57:52 +020013445int
13446_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13447 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013448{
13449 Py_ssize_t newlen;
13450 PyObject *newbuffer;
13451
Victor Stinner2740e462016-09-06 16:58:36 -070013452 assert(maxchar <= MAX_UNICODE);
13453
Victor Stinnerca9381e2015-09-22 00:58:32 +020013454 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013455 assert((maxchar > writer->maxchar && length >= 0)
13456 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013457
Victor Stinner202fdca2012-05-07 12:47:02 +020013458 if (length > PY_SSIZE_T_MAX - writer->pos) {
13459 PyErr_NoMemory();
13460 return -1;
13461 }
13462 newlen = writer->pos + length;
13463
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013464 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013465
Victor Stinnerd3f08822012-05-29 12:57:52 +020013466 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013467 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013468 if (writer->overallocate
13469 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13470 /* overallocate to limit the number of realloc() */
13471 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013472 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013473 if (newlen < writer->min_length)
13474 newlen = writer->min_length;
13475
Victor Stinnerd3f08822012-05-29 12:57:52 +020013476 writer->buffer = PyUnicode_New(newlen, maxchar);
13477 if (writer->buffer == NULL)
13478 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013479 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013480 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013481 if (writer->overallocate
13482 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13483 /* overallocate to limit the number of realloc() */
13484 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013486 if (newlen < writer->min_length)
13487 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013489 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013490 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013491 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013492 newbuffer = PyUnicode_New(newlen, maxchar);
13493 if (newbuffer == NULL)
13494 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013495 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13496 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013497 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013498 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013499 }
13500 else {
13501 newbuffer = resize_compact(writer->buffer, newlen);
13502 if (newbuffer == NULL)
13503 return -1;
13504 }
13505 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 }
13507 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013508 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509 newbuffer = PyUnicode_New(writer->size, maxchar);
13510 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013511 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13513 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013514 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013515 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013517 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013518
13519#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013520}
13521
Victor Stinnerca9381e2015-09-22 00:58:32 +020013522int
13523_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13524 enum PyUnicode_Kind kind)
13525{
13526 Py_UCS4 maxchar;
13527
13528 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13529 assert(writer->kind < kind);
13530
13531 switch (kind)
13532 {
13533 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13534 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13535 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13536 default:
13537 assert(0 && "invalid kind");
13538 return -1;
13539 }
13540
13541 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13542}
13543
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013544static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013545_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013546{
Victor Stinner2740e462016-09-06 16:58:36 -070013547 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013548 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13549 return -1;
13550 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13551 writer->pos++;
13552 return 0;
13553}
13554
13555int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013556_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13557{
13558 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13559}
13560
13561int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13563{
13564 Py_UCS4 maxchar;
13565 Py_ssize_t len;
13566
13567 if (PyUnicode_READY(str) == -1)
13568 return -1;
13569 len = PyUnicode_GET_LENGTH(str);
13570 if (len == 0)
13571 return 0;
13572 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13573 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013574 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013575 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013576 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 Py_INCREF(str);
13578 writer->buffer = str;
13579 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 writer->pos += len;
13581 return 0;
13582 }
13583 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13584 return -1;
13585 }
13586 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13587 str, 0, len);
13588 writer->pos += len;
13589 return 0;
13590}
13591
Victor Stinnere215d962012-10-06 23:03:36 +020013592int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013593_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13594 Py_ssize_t start, Py_ssize_t end)
13595{
13596 Py_UCS4 maxchar;
13597 Py_ssize_t len;
13598
13599 if (PyUnicode_READY(str) == -1)
13600 return -1;
13601
13602 assert(0 <= start);
13603 assert(end <= PyUnicode_GET_LENGTH(str));
13604 assert(start <= end);
13605
13606 if (end == 0)
13607 return 0;
13608
13609 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13610 return _PyUnicodeWriter_WriteStr(writer, str);
13611
13612 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13613 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13614 else
13615 maxchar = writer->maxchar;
13616 len = end - start;
13617
13618 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13619 return -1;
13620
13621 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13622 str, start, len);
13623 writer->pos += len;
13624 return 0;
13625}
13626
13627int
Victor Stinner4a587072013-11-19 12:54:53 +010013628_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13629 const char *ascii, Py_ssize_t len)
13630{
13631 if (len == -1)
13632 len = strlen(ascii);
13633
13634 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13635
13636 if (writer->buffer == NULL && !writer->overallocate) {
13637 PyObject *str;
13638
13639 str = _PyUnicode_FromASCII(ascii, len);
13640 if (str == NULL)
13641 return -1;
13642
13643 writer->readonly = 1;
13644 writer->buffer = str;
13645 _PyUnicodeWriter_Update(writer);
13646 writer->pos += len;
13647 return 0;
13648 }
13649
13650 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13651 return -1;
13652
13653 switch (writer->kind)
13654 {
13655 case PyUnicode_1BYTE_KIND:
13656 {
13657 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13658 Py_UCS1 *data = writer->data;
13659
Christian Heimesf051e432016-09-13 20:22:02 +020013660 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013661 break;
13662 }
13663 case PyUnicode_2BYTE_KIND:
13664 {
13665 _PyUnicode_CONVERT_BYTES(
13666 Py_UCS1, Py_UCS2,
13667 ascii, ascii + len,
13668 (Py_UCS2 *)writer->data + writer->pos);
13669 break;
13670 }
13671 case PyUnicode_4BYTE_KIND:
13672 {
13673 _PyUnicode_CONVERT_BYTES(
13674 Py_UCS1, Py_UCS4,
13675 ascii, ascii + len,
13676 (Py_UCS4 *)writer->data + writer->pos);
13677 break;
13678 }
13679 default:
13680 assert(0);
13681 }
13682
13683 writer->pos += len;
13684 return 0;
13685}
13686
13687int
13688_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13689 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013690{
13691 Py_UCS4 maxchar;
13692
13693 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13694 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13695 return -1;
13696 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13697 writer->pos += len;
13698 return 0;
13699}
13700
Victor Stinnerd3f08822012-05-29 12:57:52 +020013701PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013702_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013703{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013704 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013705
Victor Stinnerd3f08822012-05-29 12:57:52 +020013706 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013707 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013708 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013709 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013710
13711 str = writer->buffer;
13712 writer->buffer = NULL;
13713
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013714 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013715 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13716 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013717 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013718
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013719 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13720 PyObject *str2;
13721 str2 = resize_compact(str, writer->pos);
13722 if (str2 == NULL) {
13723 Py_DECREF(str);
13724 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013725 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013726 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013727 }
13728
Victor Stinner15a0bd32013-07-08 22:29:55 +020013729 assert(_PyUnicode_CheckConsistency(str, 1));
13730 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013731}
13732
Victor Stinnerd3f08822012-05-29 12:57:52 +020013733void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013734_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013735{
13736 Py_CLEAR(writer->buffer);
13737}
13738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013739#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013740
13741PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013743\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013744Return a formatted version of S, using substitutions from args and kwargs.\n\
13745The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013746
Eric Smith27bbca62010-11-04 17:06:58 +000013747PyDoc_STRVAR(format_map__doc__,
13748 "S.format_map(mapping) -> str\n\
13749\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013750Return a formatted version of S, using substitutions from mapping.\n\
13751The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013752
Eric Smith4a7d76d2008-05-30 18:10:19 +000013753static PyObject *
13754unicode__format__(PyObject* self, PyObject* args)
13755{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013756 PyObject *format_spec;
13757 _PyUnicodeWriter writer;
13758 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013759
13760 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13761 return NULL;
13762
Victor Stinnerd3f08822012-05-29 12:57:52 +020013763 if (PyUnicode_READY(self) == -1)
13764 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013765 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013766 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13767 self, format_spec, 0,
13768 PyUnicode_GET_LENGTH(format_spec));
13769 if (ret == -1) {
13770 _PyUnicodeWriter_Dealloc(&writer);
13771 return NULL;
13772 }
13773 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013774}
13775
Eric Smith8c663262007-08-25 02:26:07 +000013776PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013778\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013779Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013780
13781static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013782unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013784 Py_ssize_t size;
13785
13786 /* If it's a compact object, account for base structure +
13787 character data. */
13788 if (PyUnicode_IS_COMPACT_ASCII(v))
13789 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13790 else if (PyUnicode_IS_COMPACT(v))
13791 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013792 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013793 else {
13794 /* If it is a two-block object, account for base object, and
13795 for character block if present. */
13796 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013797 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013799 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800 }
13801 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013802 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013803 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013805 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013806 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013807
13808 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013809}
13810
13811PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013812 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013813
13814static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013815unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013816{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013817 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013818 if (!copy)
13819 return NULL;
13820 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013821}
13822
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013824 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013825 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013826 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13827 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013828 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13829 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013830 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013831 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13832 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13833 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013834 {"expandtabs", (PyCFunction) unicode_expandtabs,
13835 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013836 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013837 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013838 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13839 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13840 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013841 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013842 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13843 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13844 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013845 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013846 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013847 {"splitlines", (PyCFunction) unicode_splitlines,
13848 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013849 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013850 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13851 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13852 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13853 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13854 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13855 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13856 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13857 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13858 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13859 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13860 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13861 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13862 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13863 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013864 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013865 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013866 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013867 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013868 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013869 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013870 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013871 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013872#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013873 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013874 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013875#endif
13876
Benjamin Peterson14339b62009-01-31 16:36:08 +000013877 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013878 {NULL, NULL}
13879};
13880
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013881static PyObject *
13882unicode_mod(PyObject *v, PyObject *w)
13883{
Brian Curtindfc80e32011-08-10 20:28:54 -050013884 if (!PyUnicode_Check(v))
13885 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013886 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013887}
13888
13889static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013890 0, /*nb_add*/
13891 0, /*nb_subtract*/
13892 0, /*nb_multiply*/
13893 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013894};
13895
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013897 (lenfunc) unicode_length, /* sq_length */
13898 PyUnicode_Concat, /* sq_concat */
13899 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13900 (ssizeargfunc) unicode_getitem, /* sq_item */
13901 0, /* sq_slice */
13902 0, /* sq_ass_item */
13903 0, /* sq_ass_slice */
13904 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013905};
13906
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013907static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013908unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013910 if (PyUnicode_READY(self) == -1)
13911 return NULL;
13912
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013913 if (PyIndex_Check(item)) {
13914 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013915 if (i == -1 && PyErr_Occurred())
13916 return NULL;
13917 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013918 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013919 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013920 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013921 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013922 PyObject *result;
13923 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013924 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013925 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013926
Serhiy Storchakac26b19d2017-04-08 11:18:14 +030013927 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013928 return NULL;
13929 }
Serhiy Storchakac26b19d2017-04-08 11:18:14 +030013930 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13931 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013932
13933 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013934 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013935 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013936 slicelength == PyUnicode_GET_LENGTH(self)) {
13937 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013938 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013939 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013940 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013941 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013942 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013943 src_kind = PyUnicode_KIND(self);
13944 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013945 if (!PyUnicode_IS_ASCII(self)) {
13946 kind_limit = kind_maxchar_limit(src_kind);
13947 max_char = 0;
13948 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13949 ch = PyUnicode_READ(src_kind, src_data, cur);
13950 if (ch > max_char) {
13951 max_char = ch;
13952 if (max_char >= kind_limit)
13953 break;
13954 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013955 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013956 }
Victor Stinner55c99112011-10-13 01:17:06 +020013957 else
13958 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013959 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013960 if (result == NULL)
13961 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013962 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013963 dest_data = PyUnicode_DATA(result);
13964
13965 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013966 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13967 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013968 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013969 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013970 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013971 } else {
13972 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13973 return NULL;
13974 }
13975}
13976
13977static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 (lenfunc)unicode_length, /* mp_length */
13979 (binaryfunc)unicode_subscript, /* mp_subscript */
13980 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013981};
13982
Guido van Rossumd57fd912000-03-10 22:53:23 +000013983
Guido van Rossumd57fd912000-03-10 22:53:23 +000013984/* Helpers for PyUnicode_Format() */
13985
Victor Stinnera47082312012-10-04 02:19:54 +020013986struct unicode_formatter_t {
13987 PyObject *args;
13988 int args_owned;
13989 Py_ssize_t arglen, argidx;
13990 PyObject *dict;
13991
13992 enum PyUnicode_Kind fmtkind;
13993 Py_ssize_t fmtcnt, fmtpos;
13994 void *fmtdata;
13995 PyObject *fmtstr;
13996
13997 _PyUnicodeWriter writer;
13998};
13999
14000struct unicode_format_arg_t {
14001 Py_UCS4 ch;
14002 int flags;
14003 Py_ssize_t width;
14004 int prec;
14005 int sign;
14006};
14007
Guido van Rossumd57fd912000-03-10 22:53:23 +000014008static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014009unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014010{
Victor Stinnera47082312012-10-04 02:19:54 +020014011 Py_ssize_t argidx = ctx->argidx;
14012
14013 if (argidx < ctx->arglen) {
14014 ctx->argidx++;
14015 if (ctx->arglen < 0)
14016 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 else
Victor Stinnera47082312012-10-04 02:19:54 +020014018 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014019 }
14020 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014021 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014022 return NULL;
14023}
14024
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014025/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014026
Victor Stinnera47082312012-10-04 02:19:54 +020014027/* Format a float into the writer if the writer is not NULL, or into *p_output
14028 otherwise.
14029
14030 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014031static int
Victor Stinnera47082312012-10-04 02:19:54 +020014032formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14033 PyObject **p_output,
14034 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014035{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014036 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014037 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014038 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014039 int prec;
14040 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014041
Guido van Rossumd57fd912000-03-10 22:53:23 +000014042 x = PyFloat_AsDouble(v);
14043 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014044 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014045
Victor Stinnera47082312012-10-04 02:19:54 +020014046 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014047 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014048 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014049
Victor Stinnera47082312012-10-04 02:19:54 +020014050 if (arg->flags & F_ALT)
14051 dtoa_flags = Py_DTSF_ALT;
14052 else
14053 dtoa_flags = 0;
14054 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014055 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014056 return -1;
14057 len = strlen(p);
14058 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014059 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014060 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014061 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014062 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014063 }
14064 else
14065 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014066 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014067 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014068}
14069
Victor Stinnerd0880d52012-04-27 23:40:13 +020014070/* formatlong() emulates the format codes d, u, o, x and X, and
14071 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14072 * Python's regular ints.
14073 * Return value: a new PyUnicodeObject*, or NULL if error.
14074 * The output string is of the form
14075 * "-"? ("0x" | "0X")? digit+
14076 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14077 * set in flags. The case of hex digits will be correct,
14078 * There will be at least prec digits, zero-filled on the left if
14079 * necessary to get that many.
14080 * val object to be converted
14081 * flags bitmask of format flags; only F_ALT is looked at
14082 * prec minimum number of digits; 0-fill on left if needed
14083 * type a character in [duoxX]; u acts the same as d
14084 *
14085 * CAUTION: o, x and X conversions on regular ints can never
14086 * produce a '-' sign, but can for Python's unbounded ints.
14087 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014088PyObject *
14089_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014090{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014091 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014092 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014093 Py_ssize_t i;
14094 int sign; /* 1 if '-', else 0 */
14095 int len; /* number of characters */
14096 Py_ssize_t llen;
14097 int numdigits; /* len == numnondigits + numdigits */
14098 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014099
Victor Stinnerd0880d52012-04-27 23:40:13 +020014100 /* Avoid exceeding SSIZE_T_MAX */
14101 if (prec > INT_MAX-3) {
14102 PyErr_SetString(PyExc_OverflowError,
14103 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014105 }
14106
14107 assert(PyLong_Check(val));
14108
14109 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014110 default:
14111 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014112 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014113 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014114 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014115 /* int and int subclasses should print numerically when a numeric */
14116 /* format code is used (see issue18780) */
14117 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014118 break;
14119 case 'o':
14120 numnondigits = 2;
14121 result = PyNumber_ToBase(val, 8);
14122 break;
14123 case 'x':
14124 case 'X':
14125 numnondigits = 2;
14126 result = PyNumber_ToBase(val, 16);
14127 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014128 }
14129 if (!result)
14130 return NULL;
14131
14132 assert(unicode_modifiable(result));
14133 assert(PyUnicode_IS_READY(result));
14134 assert(PyUnicode_IS_ASCII(result));
14135
14136 /* To modify the string in-place, there can only be one reference. */
14137 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014138 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014139 PyErr_BadInternalCall();
14140 return NULL;
14141 }
14142 buf = PyUnicode_DATA(result);
14143 llen = PyUnicode_GET_LENGTH(result);
14144 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014145 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014146 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014147 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014148 return NULL;
14149 }
14150 len = (int)llen;
14151 sign = buf[0] == '-';
14152 numnondigits += sign;
14153 numdigits = len - numnondigits;
14154 assert(numdigits > 0);
14155
14156 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014157 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014158 (type == 'o' || type == 'x' || type == 'X'))) {
14159 assert(buf[sign] == '0');
14160 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14161 buf[sign+1] == 'o');
14162 numnondigits -= 2;
14163 buf += 2;
14164 len -= 2;
14165 if (sign)
14166 buf[0] = '-';
14167 assert(len == numnondigits + numdigits);
14168 assert(numdigits > 0);
14169 }
14170
14171 /* Fill with leading zeroes to meet minimum width. */
14172 if (prec > numdigits) {
14173 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14174 numnondigits + prec);
14175 char *b1;
14176 if (!r1) {
14177 Py_DECREF(result);
14178 return NULL;
14179 }
14180 b1 = PyBytes_AS_STRING(r1);
14181 for (i = 0; i < numnondigits; ++i)
14182 *b1++ = *buf++;
14183 for (i = 0; i < prec - numdigits; i++)
14184 *b1++ = '0';
14185 for (i = 0; i < numdigits; i++)
14186 *b1++ = *buf++;
14187 *b1 = '\0';
14188 Py_DECREF(result);
14189 result = r1;
14190 buf = PyBytes_AS_STRING(result);
14191 len = numnondigits + prec;
14192 }
14193
14194 /* Fix up case for hex conversions. */
14195 if (type == 'X') {
14196 /* Need to convert all lower case letters to upper case.
14197 and need to convert 0x to 0X (and -0x to -0X). */
14198 for (i = 0; i < len; i++)
14199 if (buf[i] >= 'a' && buf[i] <= 'x')
14200 buf[i] -= 'a'-'A';
14201 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014202 if (!PyUnicode_Check(result)
14203 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014204 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014205 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014206 Py_DECREF(result);
14207 result = unicode;
14208 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014209 else if (len != PyUnicode_GET_LENGTH(result)) {
14210 if (PyUnicode_Resize(&result, len) < 0)
14211 Py_CLEAR(result);
14212 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014213 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014214}
14215
Ethan Furmandf3ed242014-01-05 06:50:30 -080014216/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014217 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014218 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014219 * -1 and raise an exception on error */
14220static int
Victor Stinnera47082312012-10-04 02:19:54 +020014221mainformatlong(PyObject *v,
14222 struct unicode_format_arg_t *arg,
14223 PyObject **p_output,
14224 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014225{
14226 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014227 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014228
14229 if (!PyNumber_Check(v))
14230 goto wrongtype;
14231
Ethan Furman9ab74802014-03-21 06:38:46 -070014232 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014233 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014234 if (type == 'o' || type == 'x' || type == 'X') {
14235 iobj = PyNumber_Index(v);
14236 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014237 if (PyErr_ExceptionMatches(PyExc_TypeError))
14238 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014239 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014240 }
14241 }
14242 else {
14243 iobj = PyNumber_Long(v);
14244 if (iobj == NULL ) {
14245 if (PyErr_ExceptionMatches(PyExc_TypeError))
14246 goto wrongtype;
14247 return -1;
14248 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014249 }
14250 assert(PyLong_Check(iobj));
14251 }
14252 else {
14253 iobj = v;
14254 Py_INCREF(iobj);
14255 }
14256
14257 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014258 && arg->width == -1 && arg->prec == -1
14259 && !(arg->flags & (F_SIGN | F_BLANK))
14260 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014261 {
14262 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014263 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 int base;
14265
Victor Stinnera47082312012-10-04 02:19:54 +020014266 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014267 {
14268 default:
14269 assert(0 && "'type' not in [diuoxX]");
14270 case 'd':
14271 case 'i':
14272 case 'u':
14273 base = 10;
14274 break;
14275 case 'o':
14276 base = 8;
14277 break;
14278 case 'x':
14279 case 'X':
14280 base = 16;
14281 break;
14282 }
14283
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014284 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14285 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014286 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014287 }
14288 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014289 return 1;
14290 }
14291
Ethan Furmanb95b5612015-01-23 20:05:18 -080014292 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014293 Py_DECREF(iobj);
14294 if (res == NULL)
14295 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014296 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014297 return 0;
14298
14299wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014300 switch(type)
14301 {
14302 case 'o':
14303 case 'x':
14304 case 'X':
14305 PyErr_Format(PyExc_TypeError,
14306 "%%%c format: an integer is required, "
14307 "not %.200s",
14308 type, Py_TYPE(v)->tp_name);
14309 break;
14310 default:
14311 PyErr_Format(PyExc_TypeError,
14312 "%%%c format: a number is required, "
14313 "not %.200s",
14314 type, Py_TYPE(v)->tp_name);
14315 break;
14316 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317 return -1;
14318}
14319
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014320static Py_UCS4
14321formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014322{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014323 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014324 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014325 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014326 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014327 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014328 goto onError;
14329 }
14330 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014331 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014332 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014333 /* make sure number is a type of integer */
14334 if (!PyLong_Check(v)) {
14335 iobj = PyNumber_Index(v);
14336 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014337 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014338 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014339 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014340 Py_DECREF(iobj);
14341 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014342 else {
14343 x = PyLong_AsLong(v);
14344 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014345 if (x == -1 && PyErr_Occurred())
14346 goto onError;
14347
Victor Stinner8faf8212011-12-08 22:14:11 +010014348 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014349 PyErr_SetString(PyExc_OverflowError,
14350 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014351 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014352 }
14353
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014354 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014355 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014356
Benjamin Peterson29060642009-01-31 22:14:21 +000014357 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014358 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014359 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014360 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014361}
14362
Victor Stinnera47082312012-10-04 02:19:54 +020014363/* Parse options of an argument: flags, width, precision.
14364 Handle also "%(name)" syntax.
14365
14366 Return 0 if the argument has been formatted into arg->str.
14367 Return 1 if the argument has been written into ctx->writer,
14368 Raise an exception and return -1 on error. */
14369static int
14370unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14371 struct unicode_format_arg_t *arg)
14372{
14373#define FORMAT_READ(ctx) \
14374 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14375
14376 PyObject *v;
14377
Victor Stinnera47082312012-10-04 02:19:54 +020014378 if (arg->ch == '(') {
14379 /* Get argument value from a dictionary. Example: "%(name)s". */
14380 Py_ssize_t keystart;
14381 Py_ssize_t keylen;
14382 PyObject *key;
14383 int pcount = 1;
14384
14385 if (ctx->dict == NULL) {
14386 PyErr_SetString(PyExc_TypeError,
14387 "format requires a mapping");
14388 return -1;
14389 }
14390 ++ctx->fmtpos;
14391 --ctx->fmtcnt;
14392 keystart = ctx->fmtpos;
14393 /* Skip over balanced parentheses */
14394 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14395 arg->ch = FORMAT_READ(ctx);
14396 if (arg->ch == ')')
14397 --pcount;
14398 else if (arg->ch == '(')
14399 ++pcount;
14400 ctx->fmtpos++;
14401 }
14402 keylen = ctx->fmtpos - keystart - 1;
14403 if (ctx->fmtcnt < 0 || pcount > 0) {
14404 PyErr_SetString(PyExc_ValueError,
14405 "incomplete format key");
14406 return -1;
14407 }
14408 key = PyUnicode_Substring(ctx->fmtstr,
14409 keystart, keystart + keylen);
14410 if (key == NULL)
14411 return -1;
14412 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014413 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014414 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014415 }
14416 ctx->args = PyObject_GetItem(ctx->dict, key);
14417 Py_DECREF(key);
14418 if (ctx->args == NULL)
14419 return -1;
14420 ctx->args_owned = 1;
14421 ctx->arglen = -1;
14422 ctx->argidx = -2;
14423 }
14424
14425 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014426 while (--ctx->fmtcnt >= 0) {
14427 arg->ch = FORMAT_READ(ctx);
14428 ctx->fmtpos++;
14429 switch (arg->ch) {
14430 case '-': arg->flags |= F_LJUST; continue;
14431 case '+': arg->flags |= F_SIGN; continue;
14432 case ' ': arg->flags |= F_BLANK; continue;
14433 case '#': arg->flags |= F_ALT; continue;
14434 case '0': arg->flags |= F_ZERO; continue;
14435 }
14436 break;
14437 }
14438
14439 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014440 if (arg->ch == '*') {
14441 v = unicode_format_getnextarg(ctx);
14442 if (v == NULL)
14443 return -1;
14444 if (!PyLong_Check(v)) {
14445 PyErr_SetString(PyExc_TypeError,
14446 "* wants int");
14447 return -1;
14448 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014449 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014450 if (arg->width == -1 && PyErr_Occurred())
14451 return -1;
14452 if (arg->width < 0) {
14453 arg->flags |= F_LJUST;
14454 arg->width = -arg->width;
14455 }
14456 if (--ctx->fmtcnt >= 0) {
14457 arg->ch = FORMAT_READ(ctx);
14458 ctx->fmtpos++;
14459 }
14460 }
14461 else if (arg->ch >= '0' && arg->ch <= '9') {
14462 arg->width = arg->ch - '0';
14463 while (--ctx->fmtcnt >= 0) {
14464 arg->ch = FORMAT_READ(ctx);
14465 ctx->fmtpos++;
14466 if (arg->ch < '0' || arg->ch > '9')
14467 break;
14468 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14469 mixing signed and unsigned comparison. Since arg->ch is between
14470 '0' and '9', casting to int is safe. */
14471 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14472 PyErr_SetString(PyExc_ValueError,
14473 "width too big");
14474 return -1;
14475 }
14476 arg->width = arg->width*10 + (arg->ch - '0');
14477 }
14478 }
14479
14480 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014481 if (arg->ch == '.') {
14482 arg->prec = 0;
14483 if (--ctx->fmtcnt >= 0) {
14484 arg->ch = FORMAT_READ(ctx);
14485 ctx->fmtpos++;
14486 }
14487 if (arg->ch == '*') {
14488 v = unicode_format_getnextarg(ctx);
14489 if (v == NULL)
14490 return -1;
14491 if (!PyLong_Check(v)) {
14492 PyErr_SetString(PyExc_TypeError,
14493 "* wants int");
14494 return -1;
14495 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014496 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014497 if (arg->prec == -1 && PyErr_Occurred())
14498 return -1;
14499 if (arg->prec < 0)
14500 arg->prec = 0;
14501 if (--ctx->fmtcnt >= 0) {
14502 arg->ch = FORMAT_READ(ctx);
14503 ctx->fmtpos++;
14504 }
14505 }
14506 else if (arg->ch >= '0' && arg->ch <= '9') {
14507 arg->prec = arg->ch - '0';
14508 while (--ctx->fmtcnt >= 0) {
14509 arg->ch = FORMAT_READ(ctx);
14510 ctx->fmtpos++;
14511 if (arg->ch < '0' || arg->ch > '9')
14512 break;
14513 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14514 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014515 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014516 return -1;
14517 }
14518 arg->prec = arg->prec*10 + (arg->ch - '0');
14519 }
14520 }
14521 }
14522
14523 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14524 if (ctx->fmtcnt >= 0) {
14525 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14526 if (--ctx->fmtcnt >= 0) {
14527 arg->ch = FORMAT_READ(ctx);
14528 ctx->fmtpos++;
14529 }
14530 }
14531 }
14532 if (ctx->fmtcnt < 0) {
14533 PyErr_SetString(PyExc_ValueError,
14534 "incomplete format");
14535 return -1;
14536 }
14537 return 0;
14538
14539#undef FORMAT_READ
14540}
14541
14542/* Format one argument. Supported conversion specifiers:
14543
14544 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014545 - "i", "d", "u": int or float
14546 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014547 - "e", "E", "f", "F", "g", "G": float
14548 - "c": int or str (1 character)
14549
Victor Stinner8dbd4212012-12-04 09:30:24 +010014550 When possible, the output is written directly into the Unicode writer
14551 (ctx->writer). A string is created when padding is required.
14552
Victor Stinnera47082312012-10-04 02:19:54 +020014553 Return 0 if the argument has been formatted into *p_str,
14554 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014555 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014556static int
14557unicode_format_arg_format(struct unicode_formatter_t *ctx,
14558 struct unicode_format_arg_t *arg,
14559 PyObject **p_str)
14560{
14561 PyObject *v;
14562 _PyUnicodeWriter *writer = &ctx->writer;
14563
14564 if (ctx->fmtcnt == 0)
14565 ctx->writer.overallocate = 0;
14566
14567 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014568 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014569 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014570 return 1;
14571 }
14572
14573 v = unicode_format_getnextarg(ctx);
14574 if (v == NULL)
14575 return -1;
14576
Victor Stinnera47082312012-10-04 02:19:54 +020014577
14578 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014579 case 's':
14580 case 'r':
14581 case 'a':
14582 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14583 /* Fast path */
14584 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14585 return -1;
14586 return 1;
14587 }
14588
14589 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14590 *p_str = v;
14591 Py_INCREF(*p_str);
14592 }
14593 else {
14594 if (arg->ch == 's')
14595 *p_str = PyObject_Str(v);
14596 else if (arg->ch == 'r')
14597 *p_str = PyObject_Repr(v);
14598 else
14599 *p_str = PyObject_ASCII(v);
14600 }
14601 break;
14602
14603 case 'i':
14604 case 'd':
14605 case 'u':
14606 case 'o':
14607 case 'x':
14608 case 'X':
14609 {
14610 int ret = mainformatlong(v, arg, p_str, writer);
14611 if (ret != 0)
14612 return ret;
14613 arg->sign = 1;
14614 break;
14615 }
14616
14617 case 'e':
14618 case 'E':
14619 case 'f':
14620 case 'F':
14621 case 'g':
14622 case 'G':
14623 if (arg->width == -1 && arg->prec == -1
14624 && !(arg->flags & (F_SIGN | F_BLANK)))
14625 {
14626 /* Fast path */
14627 if (formatfloat(v, arg, NULL, writer) == -1)
14628 return -1;
14629 return 1;
14630 }
14631
14632 arg->sign = 1;
14633 if (formatfloat(v, arg, p_str, NULL) == -1)
14634 return -1;
14635 break;
14636
14637 case 'c':
14638 {
14639 Py_UCS4 ch = formatchar(v);
14640 if (ch == (Py_UCS4) -1)
14641 return -1;
14642 if (arg->width == -1 && arg->prec == -1) {
14643 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014644 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014645 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014646 return 1;
14647 }
14648 *p_str = PyUnicode_FromOrdinal(ch);
14649 break;
14650 }
14651
14652 default:
14653 PyErr_Format(PyExc_ValueError,
14654 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014655 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014656 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14657 (int)arg->ch,
14658 ctx->fmtpos - 1);
14659 return -1;
14660 }
14661 if (*p_str == NULL)
14662 return -1;
14663 assert (PyUnicode_Check(*p_str));
14664 return 0;
14665}
14666
14667static int
14668unicode_format_arg_output(struct unicode_formatter_t *ctx,
14669 struct unicode_format_arg_t *arg,
14670 PyObject *str)
14671{
14672 Py_ssize_t len;
14673 enum PyUnicode_Kind kind;
14674 void *pbuf;
14675 Py_ssize_t pindex;
14676 Py_UCS4 signchar;
14677 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014678 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014679 Py_ssize_t sublen;
14680 _PyUnicodeWriter *writer = &ctx->writer;
14681 Py_UCS4 fill;
14682
14683 fill = ' ';
14684 if (arg->sign && arg->flags & F_ZERO)
14685 fill = '0';
14686
14687 if (PyUnicode_READY(str) == -1)
14688 return -1;
14689
14690 len = PyUnicode_GET_LENGTH(str);
14691 if ((arg->width == -1 || arg->width <= len)
14692 && (arg->prec == -1 || arg->prec >= len)
14693 && !(arg->flags & (F_SIGN | F_BLANK)))
14694 {
14695 /* Fast path */
14696 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14697 return -1;
14698 return 0;
14699 }
14700
14701 /* Truncate the string for "s", "r" and "a" formats
14702 if the precision is set */
14703 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14704 if (arg->prec >= 0 && len > arg->prec)
14705 len = arg->prec;
14706 }
14707
14708 /* Adjust sign and width */
14709 kind = PyUnicode_KIND(str);
14710 pbuf = PyUnicode_DATA(str);
14711 pindex = 0;
14712 signchar = '\0';
14713 if (arg->sign) {
14714 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14715 if (ch == '-' || ch == '+') {
14716 signchar = ch;
14717 len--;
14718 pindex++;
14719 }
14720 else if (arg->flags & F_SIGN)
14721 signchar = '+';
14722 else if (arg->flags & F_BLANK)
14723 signchar = ' ';
14724 else
14725 arg->sign = 0;
14726 }
14727 if (arg->width < len)
14728 arg->width = len;
14729
14730 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014731 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014732 if (!(arg->flags & F_LJUST)) {
14733 if (arg->sign) {
14734 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014735 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014736 }
14737 else {
14738 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014739 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014740 }
14741 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014742 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14743 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014744 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014745 }
14746
Victor Stinnera47082312012-10-04 02:19:54 +020014747 buflen = arg->width;
14748 if (arg->sign && len == arg->width)
14749 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014750 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014751 return -1;
14752
14753 /* Write the sign if needed */
14754 if (arg->sign) {
14755 if (fill != ' ') {
14756 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14757 writer->pos += 1;
14758 }
14759 if (arg->width > len)
14760 arg->width--;
14761 }
14762
14763 /* Write the numeric prefix for "x", "X" and "o" formats
14764 if the alternate form is used.
14765 For example, write "0x" for the "%#x" format. */
14766 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14767 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14768 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14769 if (fill != ' ') {
14770 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14771 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14772 writer->pos += 2;
14773 pindex += 2;
14774 }
14775 arg->width -= 2;
14776 if (arg->width < 0)
14777 arg->width = 0;
14778 len -= 2;
14779 }
14780
14781 /* Pad left with the fill character if needed */
14782 if (arg->width > len && !(arg->flags & F_LJUST)) {
14783 sublen = arg->width - len;
14784 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14785 writer->pos += sublen;
14786 arg->width = len;
14787 }
14788
14789 /* If padding with spaces: write sign if needed and/or numeric prefix if
14790 the alternate form is used */
14791 if (fill == ' ') {
14792 if (arg->sign) {
14793 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14794 writer->pos += 1;
14795 }
14796 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14797 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14798 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14799 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14800 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14801 writer->pos += 2;
14802 pindex += 2;
14803 }
14804 }
14805
14806 /* Write characters */
14807 if (len) {
14808 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14809 str, pindex, len);
14810 writer->pos += len;
14811 }
14812
14813 /* Pad right with the fill character if needed */
14814 if (arg->width > len) {
14815 sublen = arg->width - len;
14816 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14817 writer->pos += sublen;
14818 }
14819 return 0;
14820}
14821
14822/* Helper of PyUnicode_Format(): format one arg.
14823 Return 0 on success, raise an exception and return -1 on error. */
14824static int
14825unicode_format_arg(struct unicode_formatter_t *ctx)
14826{
14827 struct unicode_format_arg_t arg;
14828 PyObject *str;
14829 int ret;
14830
Victor Stinner8dbd4212012-12-04 09:30:24 +010014831 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14832 arg.flags = 0;
14833 arg.width = -1;
14834 arg.prec = -1;
14835 arg.sign = 0;
14836 str = NULL;
14837
Victor Stinnera47082312012-10-04 02:19:54 +020014838 ret = unicode_format_arg_parse(ctx, &arg);
14839 if (ret == -1)
14840 return -1;
14841
14842 ret = unicode_format_arg_format(ctx, &arg, &str);
14843 if (ret == -1)
14844 return -1;
14845
14846 if (ret != 1) {
14847 ret = unicode_format_arg_output(ctx, &arg, str);
14848 Py_DECREF(str);
14849 if (ret == -1)
14850 return -1;
14851 }
14852
14853 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14854 PyErr_SetString(PyExc_TypeError,
14855 "not all arguments converted during string formatting");
14856 return -1;
14857 }
14858 return 0;
14859}
14860
Alexander Belopolsky40018472011-02-26 01:02:56 +000014861PyObject *
14862PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014863{
Victor Stinnera47082312012-10-04 02:19:54 +020014864 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014865
Guido van Rossumd57fd912000-03-10 22:53:23 +000014866 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014867 PyErr_BadInternalCall();
14868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014869 }
Victor Stinnera47082312012-10-04 02:19:54 +020014870
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014871 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014872 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014873
14874 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014875 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14876 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14877 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14878 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014879
Victor Stinner8f674cc2013-04-17 23:02:17 +020014880 _PyUnicodeWriter_Init(&ctx.writer);
14881 ctx.writer.min_length = ctx.fmtcnt + 100;
14882 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014883
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014885 ctx.arglen = PyTuple_Size(args);
14886 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014887 }
14888 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014889 ctx.arglen = -1;
14890 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014891 }
Victor Stinnera47082312012-10-04 02:19:54 +020014892 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014893 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014894 ctx.dict = args;
14895 else
14896 ctx.dict = NULL;
14897 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014898
Victor Stinnera47082312012-10-04 02:19:54 +020014899 while (--ctx.fmtcnt >= 0) {
14900 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014901 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014902
14903 nonfmtpos = ctx.fmtpos++;
14904 while (ctx.fmtcnt >= 0 &&
14905 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14906 ctx.fmtpos++;
14907 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014908 }
Victor Stinnera47082312012-10-04 02:19:54 +020014909 if (ctx.fmtcnt < 0) {
14910 ctx.fmtpos--;
14911 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014912 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014913
Victor Stinnercfc4c132013-04-03 01:48:39 +020014914 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14915 nonfmtpos, ctx.fmtpos) < 0)
14916 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014917 }
14918 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014919 ctx.fmtpos++;
14920 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014921 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014922 }
14923 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014924
Victor Stinnera47082312012-10-04 02:19:54 +020014925 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014926 PyErr_SetString(PyExc_TypeError,
14927 "not all arguments converted during string formatting");
14928 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014929 }
14930
Victor Stinnera47082312012-10-04 02:19:54 +020014931 if (ctx.args_owned) {
14932 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014933 }
Victor Stinnera47082312012-10-04 02:19:54 +020014934 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014935
Benjamin Peterson29060642009-01-31 22:14:21 +000014936 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014937 _PyUnicodeWriter_Dealloc(&ctx.writer);
14938 if (ctx.args_owned) {
14939 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940 }
14941 return NULL;
14942}
14943
Jeremy Hylton938ace62002-07-17 16:30:39 +000014944static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014945unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14946
Tim Peters6d6c1a32001-08-02 04:15:00 +000014947static PyObject *
14948unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14949{
Benjamin Peterson29060642009-01-31 22:14:21 +000014950 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014951 static char *kwlist[] = {"object", "encoding", "errors", 0};
14952 char *encoding = NULL;
14953 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014954
Benjamin Peterson14339b62009-01-31 16:36:08 +000014955 if (type != &PyUnicode_Type)
14956 return unicode_subtype_new(type, args, kwds);
14957 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014958 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014959 return NULL;
14960 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014961 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014962 if (encoding == NULL && errors == NULL)
14963 return PyObject_Str(x);
14964 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014965 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014966}
14967
Guido van Rossume023fe02001-08-30 03:12:59 +000014968static PyObject *
14969unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14970{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014971 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014972 Py_ssize_t length, char_size;
14973 int share_wstr, share_utf8;
14974 unsigned int kind;
14975 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014976
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014978
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014979 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014980 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014981 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014982 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014983 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014984 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014985 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014986 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014987
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014988 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014989 if (self == NULL) {
14990 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 return NULL;
14992 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014993 kind = PyUnicode_KIND(unicode);
14994 length = PyUnicode_GET_LENGTH(unicode);
14995
14996 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014997#ifdef Py_DEBUG
14998 _PyUnicode_HASH(self) = -1;
14999#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015000 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015001#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015002 _PyUnicode_STATE(self).interned = 0;
15003 _PyUnicode_STATE(self).kind = kind;
15004 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015005 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015006 _PyUnicode_STATE(self).ready = 1;
15007 _PyUnicode_WSTR(self) = NULL;
15008 _PyUnicode_UTF8_LENGTH(self) = 0;
15009 _PyUnicode_UTF8(self) = NULL;
15010 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015011 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015012
15013 share_utf8 = 0;
15014 share_wstr = 0;
15015 if (kind == PyUnicode_1BYTE_KIND) {
15016 char_size = 1;
15017 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15018 share_utf8 = 1;
15019 }
15020 else if (kind == PyUnicode_2BYTE_KIND) {
15021 char_size = 2;
15022 if (sizeof(wchar_t) == 2)
15023 share_wstr = 1;
15024 }
15025 else {
15026 assert(kind == PyUnicode_4BYTE_KIND);
15027 char_size = 4;
15028 if (sizeof(wchar_t) == 4)
15029 share_wstr = 1;
15030 }
15031
15032 /* Ensure we won't overflow the length. */
15033 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15034 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015035 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015036 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015037 data = PyObject_MALLOC((length + 1) * char_size);
15038 if (data == NULL) {
15039 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015040 goto onError;
15041 }
15042
Victor Stinnerc3c74152011-10-02 20:39:55 +020015043 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015044 if (share_utf8) {
15045 _PyUnicode_UTF8_LENGTH(self) = length;
15046 _PyUnicode_UTF8(self) = data;
15047 }
15048 if (share_wstr) {
15049 _PyUnicode_WSTR_LENGTH(self) = length;
15050 _PyUnicode_WSTR(self) = (wchar_t *)data;
15051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015052
Christian Heimesf051e432016-09-13 20:22:02 +020015053 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015054 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015055 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015056#ifdef Py_DEBUG
15057 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15058#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015059 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015060 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061
15062onError:
15063 Py_DECREF(unicode);
15064 Py_DECREF(self);
15065 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015066}
15067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015068PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015069"str(object='') -> str\n\
15070str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015071\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015072Create a new string object from the given object. If encoding or\n\
15073errors is specified, then the object must expose a data buffer\n\
15074that will be decoded using the given encoding and error handler.\n\
15075Otherwise, returns the result of object.__str__() (if defined)\n\
15076or repr(object).\n\
15077encoding defaults to sys.getdefaultencoding().\n\
15078errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015079
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015080static PyObject *unicode_iter(PyObject *seq);
15081
Guido van Rossumd57fd912000-03-10 22:53:23 +000015082PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015083 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015084 "str", /* tp_name */
15085 sizeof(PyUnicodeObject), /* tp_size */
15086 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015088 (destructor)unicode_dealloc, /* tp_dealloc */
15089 0, /* tp_print */
15090 0, /* tp_getattr */
15091 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015092 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 unicode_repr, /* tp_repr */
15094 &unicode_as_number, /* tp_as_number */
15095 &unicode_as_sequence, /* tp_as_sequence */
15096 &unicode_as_mapping, /* tp_as_mapping */
15097 (hashfunc) unicode_hash, /* tp_hash*/
15098 0, /* tp_call*/
15099 (reprfunc) unicode_str, /* tp_str */
15100 PyObject_GenericGetAttr, /* tp_getattro */
15101 0, /* tp_setattro */
15102 0, /* tp_as_buffer */
15103 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015104 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 unicode_doc, /* tp_doc */
15106 0, /* tp_traverse */
15107 0, /* tp_clear */
15108 PyUnicode_RichCompare, /* tp_richcompare */
15109 0, /* tp_weaklistoffset */
15110 unicode_iter, /* tp_iter */
15111 0, /* tp_iternext */
15112 unicode_methods, /* tp_methods */
15113 0, /* tp_members */
15114 0, /* tp_getset */
15115 &PyBaseObject_Type, /* tp_base */
15116 0, /* tp_dict */
15117 0, /* tp_descr_get */
15118 0, /* tp_descr_set */
15119 0, /* tp_dictoffset */
15120 0, /* tp_init */
15121 0, /* tp_alloc */
15122 unicode_new, /* tp_new */
15123 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015124};
15125
15126/* Initialize the Unicode implementation */
15127
Victor Stinner3a50e702011-10-18 21:21:00 +020015128int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015129{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015130 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015131 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015132 0x000A, /* LINE FEED */
15133 0x000D, /* CARRIAGE RETURN */
15134 0x001C, /* FILE SEPARATOR */
15135 0x001D, /* GROUP SEPARATOR */
15136 0x001E, /* RECORD SEPARATOR */
15137 0x0085, /* NEXT LINE */
15138 0x2028, /* LINE SEPARATOR */
15139 0x2029, /* PARAGRAPH SEPARATOR */
15140 };
15141
Fred Drakee4315f52000-05-09 19:53:39 +000015142 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015143 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015144 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015145 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015146 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015147
Guido van Rossumcacfc072002-05-24 19:01:59 +000015148 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015149 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015150
15151 /* initialize the linebreak bloom filter */
15152 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015153 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015154 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015155
Christian Heimes26532f72013-07-20 14:57:16 +020015156 if (PyType_Ready(&EncodingMapType) < 0)
15157 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015158
Benjamin Petersonc4311282012-10-30 23:21:10 -040015159 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15160 Py_FatalError("Can't initialize field name iterator type");
15161
15162 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15163 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015164
Victor Stinner3a50e702011-10-18 21:21:00 +020015165 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015166}
15167
/* Finalize the Unicode implementation */
15169
/* Free-list clearing hook for str objects.  This implementation releases
   nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15175
Guido van Rossumd57fd912000-03-10 22:53:23 +000015176void
Thomas Wouters78890102000-07-22 19:25:51 +000015177_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015178{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015179 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015180
Serhiy Storchaka05997252013-01-26 12:14:02 +020015181 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015182
Serhiy Storchaka05997252013-01-26 12:14:02 +020015183 for (i = 0; i < 256; i++)
15184 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015185 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015186 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015187}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015188
Walter Dörwald16807132007-05-25 13:52:07 +000015189void
15190PyUnicode_InternInPlace(PyObject **p)
15191{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015192 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015194#ifdef Py_DEBUG
15195 assert(s != NULL);
15196 assert(_PyUnicode_CHECK(s));
15197#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015199 return;
15200#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015201 /* If it's a subclass, we don't really know what putting
15202 it in the interned dict might do. */
15203 if (!PyUnicode_CheckExact(s))
15204 return;
15205 if (PyUnicode_CHECK_INTERNED(s))
15206 return;
15207 if (interned == NULL) {
15208 interned = PyDict_New();
15209 if (interned == NULL) {
15210 PyErr_Clear(); /* Don't leave an exception */
15211 return;
15212 }
15213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015215 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015217 if (t == NULL) {
15218 PyErr_Clear();
15219 return;
15220 }
15221 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015222 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015223 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015224 return;
15225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 /* The two references in interned are not counted by refcnt.
15227 The deallocator will take care of this */
15228 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015229 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015230}
15231
15232void
15233PyUnicode_InternImmortal(PyObject **p)
15234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 PyUnicode_InternInPlace(p);
15236 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015237 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015238 Py_INCREF(*p);
15239 }
Walter Dörwald16807132007-05-25 13:52:07 +000015240}
15241
15242PyObject *
15243PyUnicode_InternFromString(const char *cp)
15244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 PyObject *s = PyUnicode_FromString(cp);
15246 if (s == NULL)
15247 return NULL;
15248 PyUnicode_InternInPlace(&s);
15249 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015250}
15251
Alexander Belopolsky40018472011-02-26 01:02:56 +000015252void
15253_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015254{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015255 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015256 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 Py_ssize_t i, n;
15258 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015259
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 if (interned == NULL || !PyDict_Check(interned))
15261 return;
15262 keys = PyDict_Keys(interned);
15263 if (keys == NULL || !PyList_Check(keys)) {
15264 PyErr_Clear();
15265 return;
15266 }
Walter Dörwald16807132007-05-25 13:52:07 +000015267
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15269 detector, interned unicode strings are not forcibly deallocated;
15270 rather, we give them their stolen references back, and then clear
15271 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015272
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 n = PyList_GET_SIZE(keys);
15274 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015275 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015277 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015278 if (PyUnicode_READY(s) == -1) {
15279 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015280 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015282 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 case SSTATE_NOT_INTERNED:
15284 /* XXX Shouldn't happen */
15285 break;
15286 case SSTATE_INTERNED_IMMORTAL:
15287 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015288 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 break;
15290 case SSTATE_INTERNED_MORTAL:
15291 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015292 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 break;
15294 default:
15295 Py_FatalError("Inconsistent interned string state.");
15296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015297 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 }
15299 fprintf(stderr, "total size of all interned strings: "
15300 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15301 "mortal/immortal\n", mortal_size, immortal_size);
15302 Py_DECREF(keys);
15303 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015304 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015305}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015306
15307
/********************* Unicode Iterator **************************/
15309
15310typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 PyObject_HEAD
15312 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015313 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015314} unicodeiterobject;
15315
15316static void
15317unicodeiter_dealloc(unicodeiterobject *it)
15318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 _PyObject_GC_UNTRACK(it);
15320 Py_XDECREF(it->it_seq);
15321 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015322}
15323
15324static int
15325unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 Py_VISIT(it->it_seq);
15328 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015329}
15330
15331static PyObject *
15332unicodeiter_next(unicodeiterobject *it)
15333{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015334 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015335
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 assert(it != NULL);
15337 seq = it->it_seq;
15338 if (seq == NULL)
15339 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015340 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015342 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15343 int kind = PyUnicode_KIND(seq);
15344 void *data = PyUnicode_DATA(seq);
15345 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15346 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 if (item != NULL)
15348 ++it->it_index;
15349 return item;
15350 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015351
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015353 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015354 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015355}
15356
15357static PyObject *
15358unicodeiter_len(unicodeiterobject *it)
15359{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015360 Py_ssize_t len = 0;
15361 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015362 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015364}
15365
15366PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15367
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015368static PyObject *
15369unicodeiter_reduce(unicodeiterobject *it)
15370{
15371 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015372 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015373 it->it_seq, it->it_index);
15374 } else {
15375 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15376 if (u == NULL)
15377 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015378 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015379 }
15380}
15381
15382PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15383
15384static PyObject *
15385unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15386{
15387 Py_ssize_t index = PyLong_AsSsize_t(state);
15388 if (index == -1 && PyErr_Occurred())
15389 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015390 if (it->it_seq != NULL) {
15391 if (index < 0)
15392 index = 0;
15393 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15394 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15395 it->it_index = index;
15396 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015397 Py_RETURN_NONE;
15398}
15399
15400PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15401
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015402static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015403 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015404 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015405 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15406 reduce_doc},
15407 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15408 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410};
15411
15412PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015413 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15414 "str_iterator", /* tp_name */
15415 sizeof(unicodeiterobject), /* tp_basicsize */
15416 0, /* tp_itemsize */
15417 /* methods */
15418 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15419 0, /* tp_print */
15420 0, /* tp_getattr */
15421 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015422 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 0, /* tp_repr */
15424 0, /* tp_as_number */
15425 0, /* tp_as_sequence */
15426 0, /* tp_as_mapping */
15427 0, /* tp_hash */
15428 0, /* tp_call */
15429 0, /* tp_str */
15430 PyObject_GenericGetAttr, /* tp_getattro */
15431 0, /* tp_setattro */
15432 0, /* tp_as_buffer */
15433 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15434 0, /* tp_doc */
15435 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15436 0, /* tp_clear */
15437 0, /* tp_richcompare */
15438 0, /* tp_weaklistoffset */
15439 PyObject_SelfIter, /* tp_iter */
15440 (iternextfunc)unicodeiter_next, /* tp_iternext */
15441 unicodeiter_methods, /* tp_methods */
15442 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015443};
15444
15445static PyObject *
15446unicode_iter(PyObject *seq)
15447{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015448 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015449
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 if (!PyUnicode_Check(seq)) {
15451 PyErr_BadInternalCall();
15452 return NULL;
15453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015454 if (PyUnicode_READY(seq) == -1)
15455 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015456 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15457 if (it == NULL)
15458 return NULL;
15459 it->it_index = 0;
15460 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015461 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 _PyObject_GC_TRACK(it);
15463 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015464}
15465
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015466
15467size_t
15468Py_UNICODE_strlen(const Py_UNICODE *u)
15469{
15470 int res = 0;
15471 while(*u++)
15472 res++;
15473 return res;
15474}
15475
15476Py_UNICODE*
15477Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15478{
15479 Py_UNICODE *u = s1;
15480 while ((*u++ = *s2++));
15481 return s1;
15482}
15483
15484Py_UNICODE*
15485Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15486{
15487 Py_UNICODE *u = s1;
15488 while ((*u++ = *s2++))
15489 if (n-- == 0)
15490 break;
15491 return s1;
15492}
15493
15494Py_UNICODE*
15495Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15496{
15497 Py_UNICODE *u1 = s1;
15498 u1 += Py_UNICODE_strlen(u1);
15499 Py_UNICODE_strcpy(u1, s2);
15500 return s1;
15501}
15502
15503int
15504Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15505{
15506 while (*s1 && *s2 && *s1 == *s2)
15507 s1++, s2++;
15508 if (*s1 && *s2)
15509 return (*s1 < *s2) ? -1 : +1;
15510 if (*s1)
15511 return 1;
15512 if (*s2)
15513 return -1;
15514 return 0;
15515}
15516
15517int
15518Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15519{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015520 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015521 for (; n != 0; n--) {
15522 u1 = *s1;
15523 u2 = *s2;
15524 if (u1 != u2)
15525 return (u1 < u2) ? -1 : +1;
15526 if (u1 == '\0')
15527 return 0;
15528 s1++;
15529 s2++;
15530 }
15531 return 0;
15532}
15533
15534Py_UNICODE*
15535Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15536{
15537 const Py_UNICODE *p;
15538 for (p = s; *p; p++)
15539 if (*p == c)
15540 return (Py_UNICODE*)p;
15541 return NULL;
15542}
15543
15544Py_UNICODE*
15545Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15546{
15547 const Py_UNICODE *p;
15548 p = s + Py_UNICODE_strlen(s);
15549 while (p != s) {
15550 p--;
15551 if (*p == c)
15552 return (Py_UNICODE*)p;
15553 }
15554 return NULL;
15555}
Victor Stinner331ea922010-08-10 16:37:20 +000015556
Victor Stinner71133ff2010-09-01 23:43:53 +000015557Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015558PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015559{
Victor Stinner577db2c2011-10-11 22:12:48 +020015560 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015561 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015563 if (!PyUnicode_Check(unicode)) {
15564 PyErr_BadArgument();
15565 return NULL;
15566 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015567 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015568 if (u == NULL)
15569 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015570 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015571 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015572 PyErr_NoMemory();
15573 return NULL;
15574 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015575 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015576 size *= sizeof(Py_UNICODE);
15577 copy = PyMem_Malloc(size);
15578 if (copy == NULL) {
15579 PyErr_NoMemory();
15580 return NULL;
15581 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015582 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015583 return copy;
15584}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015585
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
15588
15589static PyMethodDef _string_methods[] = {
15590 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15591 METH_O, PyDoc_STR("split the argument as a field name")},
15592 {"formatter_parser", (PyCFunction) formatter_parser,
15593 METH_O, PyDoc_STR("parse the argument as a format string")},
15594 {NULL, NULL}
15595};
15596
15597static struct PyModuleDef _string_module = {
15598 PyModuleDef_HEAD_INIT,
15599 "_string",
15600 PyDoc_STR("string helper module"),
15601 0,
15602 _string_methods,
15603 NULL,
15604 NULL,
15605 NULL,
15606 NULL
15607};
15608
15609PyMODINIT_FUNC
15610PyInit__string(void)
15611{
15612 return PyModule_Create(&_string_module);
15613}
15614
15615
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015616#ifdef __cplusplus
15617}
15618#endif