blob: 9223c9911e60682a912c5c04c3868a17cff023fb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside assert() by the macros of this file.  Debug
   builds run the consistency checker (with check_content == 0); other
   builds only verify the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* --- Unchecked field accessors -------------------------------------------
   Each macro casts `op` to the struct type that declares the field and
   expands to the field itself, so the expansion may also be assigned to. */
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ----------------------------------------------------
   These assert on `op` first and are therefore rvalues only. */

/* UTF-8 buffer: for a compact ASCII string this is the memory located
   directly after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op)))

/* UTF-8 length: for a compact ASCII string this is the length field,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_LENGTH(op) :                        \
         _PyUnicode_UTF8_LENGTH(op)))

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY: asserts on `op`, then
   evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op) ?                 \
      _PyUnicode_Ready(op) :                    \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of `op` is the same address as PyUnicode_DATA(op).
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of `op` is the same address as PyUnicode_DATA(op).
   The macro refers only to its own parameter: the previous expansion read
   `unicode` from the caller's scope, which silently tested the wrong object
   (or failed to compile) whenever the argument had a different name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has its own allocated UTF-8 memory block,
   i.e. the string is not compact ASCII, its utf8 pointer is set, and that
   pointer differs from PyUnicode_DATA(op).  The tests are evaluated in that
   order, so the utf8 field is never read for a compact ASCII string. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!(PyUnicode_IS_COMPACT_ASCII(op)                           \
       || !_PyUnicode_UTF8(op)                                  \
       || _PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object has its own allocated wstr memory block:
   the wstr pointer is set and either the string is not ready yet or the
   pointer differs from PyUnicode_DATA(op).  PyUnicode_DATA is only
   evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    ((_PyUnicode_WSTR(op) &&                                    \
      !(PyUnicode_IS_READY(op) &&                               \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
140
/* Generic element-wise copy between character arrays of different widths.

   from_type / to_type  valid type names of the source / destination units
   begin, end           pointers delimiting the source ("from_type *")
   to                   pointer to the destination buffer ("to_type *")

   Each source unit is cast to to_type.  The bulk of the input is copied
   four units per iteration; the remaining 0-3 units are copied one by one.
   The arguments are evaluated exactly once, in the order to, begin, end. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _n = _end - _iter;                                   \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                         \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; _iter++, _to++)                            \
            *_to = (to_type) *_iter;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same as the whitespace table, but for line breaks: a 128-entry table
   indexed by ASCII code, 1 for a line break character and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-build validation of the internal layout of a unicode object.

   op             object to inspect; must satisfy PyUnicode_Check()
   check_content  if non-zero (and the string is not of WCHAR kind), also
                  scan the characters to verify that the smallest possible
                  kind is used and that the data is null-terminated

   Every check is an assert(), so a violation aborts the process.  The
   function always returns 1, which allows calling it inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters follow the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
            else {
                /* string that only has a wstr representation so far */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length recorded */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t pos = 0;

        while (pos < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character data must be followed by a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
526/* stuff to implement simple "bloom filters" for Unicode characters.
527 to keep things simple, we use a single bitmask, using the least 5
528 bits from each unicode characters as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542#define BLOOM_MASK unsigned long
543
Serhiy Storchaka05997252013-01-26 12:14:02 +0200544static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
/* Non-zero if the bit selected by `ch` (its low-order bits, modulo
   BLOOM_WIDTH) is set in `mask`.  Both arguments are parenthesized so that
   a compound mask expression such as `a | b` is tested as a whole. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   for the others Py_UNICODE_ISLINEBREAK() is only consulted when the
   bloom_linebreak mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
Victor Stinnerafffce42012-10-03 23:03:17 +0200677#ifdef Py_DEBUG
678/* Fill the data of an Unicode string with invalid characters to detect bugs
679 earlier.
680
681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
683 invalid character in Unicode 6.0. */
684static void
685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
686{
687 int kind = PyUnicode_KIND(unicode);
688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
689 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
690 if (length <= old_length)
691 return;
692 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
693}
694#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Victor Stinner84def372011-12-11 20:04:56 +0100725 _Py_DEC_REFTOTAL;
726 _Py_ForgetReference(unicode);
727
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300728 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100729 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100730 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 PyErr_NoMemory();
732 return NULL;
733 }
Victor Stinner84def372011-12-11 20:04:56 +0100734 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100736
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100740 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_WSTR_LENGTH(unicode) = length;
742 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100743 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
744 PyObject_DEL(_PyUnicode_WSTR(unicode));
745 _PyUnicode_WSTR(unicode) = NULL;
746 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200747#ifdef Py_DEBUG
748 unicode_fill_invalid(unicode, old_length);
749#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
751 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200752 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 return unicode;
754}
755
Alexander Belopolsky40018472011-02-26 01:02:56 +0000756static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200757resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
Victor Stinner95663112011-10-04 01:03:50 +0200759 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100760 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200761 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200762 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 if (PyUnicode_IS_READY(unicode)) {
765 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200766 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
770#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771
772 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200773 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200774 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
775 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776
777 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 new_size = (length + 1) * char_size;
782
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
784 {
785 PyObject_DEL(_PyUnicode_UTF8(unicode));
786 _PyUnicode_UTF8(unicode) = NULL;
787 _PyUnicode_UTF8_LENGTH(unicode) = 0;
788 }
789
Victor Stinnerfe226c02011-10-03 03:52:20 +0200790 data = (PyObject *)PyObject_REALLOC(data, new_size);
791 if (data == NULL) {
792 PyErr_NoMemory();
793 return -1;
794 }
795 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200796 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 _PyUnicode_WSTR_LENGTH(unicode) = length;
799 }
800 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200801 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_UTF8_LENGTH(unicode) = length;
803 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_LENGTH(unicode) = length;
805 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200806#ifdef Py_DEBUG
807 unicode_fill_invalid(unicode, old_length);
808#endif
Victor Stinner95663112011-10-04 01:03:50 +0200809 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200810 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
Victor Stinner95663112011-10-04 01:03:50 +0200814 assert(_PyUnicode_WSTR(unicode) != NULL);
815
816 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700817 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200818 PyErr_NoMemory();
819 return -1;
820 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100821 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200822 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200824 if (!wstr) {
825 PyErr_NoMemory();
826 return -1;
827 }
828 _PyUnicode_WSTR(unicode) = wstr;
829 _PyUnicode_WSTR(unicode)[length] = 0;
830 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200831 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832 return 0;
833}
834
Victor Stinnerfe226c02011-10-03 03:52:20 +0200835static PyObject*
836resize_copy(PyObject *unicode, Py_ssize_t length)
837{
838 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100839 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200840 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841
Benjamin Petersonbac79492012-01-14 13:34:47 -0500842 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844
845 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
846 if (copy == NULL)
847 return NULL;
848
849 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200850 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200852 }
853 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200854 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100855
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200857 if (w == NULL)
858 return NULL;
859 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
860 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200861 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
862 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 }
865}
866
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000868 Ux0000 terminated; some code (e.g. new_identifier)
869 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000872 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
874*/
875
Alexander Belopolsky40018472011-02-26 01:02:56 +0000876static PyUnicodeObject *
877_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200879 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 if (length == 0 && unicode_empty != NULL) {
884 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200885 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886 }
887
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000888 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700889 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 return (PyUnicodeObject *)PyErr_NoMemory();
891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 if (length < 0) {
893 PyErr_SetString(PyExc_SystemError,
894 "Negative size passed to _PyUnicode_New");
895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896 }
897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
899 if (unicode == NULL)
900 return NULL;
901 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100902
903 _PyUnicode_WSTR_LENGTH(unicode) = length;
904 _PyUnicode_HASH(unicode) = -1;
905 _PyUnicode_STATE(unicode).interned = 0;
906 _PyUnicode_STATE(unicode).kind = 0;
907 _PyUnicode_STATE(unicode).compact = 0;
908 _PyUnicode_STATE(unicode).ready = 0;
909 _PyUnicode_STATE(unicode).ascii = 0;
910 _PyUnicode_DATA_ANY(unicode) = NULL;
911 _PyUnicode_LENGTH(unicode) = 0;
912 _PyUnicode_UTF8(unicode) = NULL;
913 _PyUnicode_UTF8_LENGTH(unicode) = 0;
914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
916 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921
Jeremy Hyltond8082792003-09-16 19:41:39 +0000922 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000923 * the caller fails before initializing str -- unicode_resize()
924 * reads str[0], and the Keep-Alive optimization can keep memory
925 * allocated for str alive across a call to unicode_dealloc(unicode).
926 * We don't want unicode_resize to read uninitialized memory in
927 * that case.
928 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 _PyUnicode_WSTR(unicode)[0] = 0;
930 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100931
Victor Stinner7931d9a2011-11-04 00:22:48 +0100932 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933 return unicode;
934}
935
Victor Stinnerf42dc442011-10-02 23:33:16 +0200936static const char*
937unicode_kind_name(PyObject *unicode)
938{
Victor Stinner42dfd712011-10-03 14:41:45 +0200939 /* don't check consistency: unicode_kind_name() is called from
940 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941 if (!PyUnicode_IS_COMPACT(unicode))
942 {
943 if (!PyUnicode_IS_READY(unicode))
944 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600945 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 {
947 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200948 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200949 return "legacy ascii";
950 else
951 return "legacy latin1";
952 case PyUnicode_2BYTE_KIND:
953 return "legacy UCS2";
954 case PyUnicode_4BYTE_KIND:
955 return "legacy UCS4";
956 default:
957 return "<legacy invalid kind>";
958 }
959 }
960 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600961 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 return "ascii";
965 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200966 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 default:
972 return "<invalid compact kind>";
973 }
974}
975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977/* Functions wrapping macros for use in debugger */
978char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200979 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980}
981
982void *_PyUnicode_compact_data(void *unicode) {
983 return _PyUnicode_COMPACT_DATA(unicode);
984}
985void *_PyUnicode_data(void *unicode){
986 printf("obj %p\n", unicode);
987 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
988 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
989 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
990 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
991 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
992 return PyUnicode_DATA(unicode);
993}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200994
995void
996_PyUnicode_Dump(PyObject *op)
997{
998 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200999 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1000 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1001 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001002
Victor Stinnera849a4b2011-10-03 12:12:11 +02001003 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001004 {
1005 if (ascii->state.ascii)
1006 data = (ascii + 1);
1007 else
1008 data = (compact + 1);
1009 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 else
1011 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001012 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1013 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001020 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001023 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1024 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001147 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
1149 This function assumes that unicode can hold one more code point than wstr
1150 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001151static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001153 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 const wchar_t *iter;
1156 Py_UCS4 *ucs4_out;
1157
Victor Stinner910337b2011-10-03 03:20:16 +02001158 assert(unicode != NULL);
1159 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163 for (iter = begin; iter < end; ) {
1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167 && (iter+1) < end
1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 {
Victor Stinner551ac952011-11-29 22:58:13 +01001170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 iter += 2;
1172 }
1173 else {
1174 *ucs4_out++ = *iter;
1175 iter++;
1176 }
1177 }
1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179 _PyUnicode_GET_LENGTH(unicode)));
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181}
1182#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001536 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1537 PyErr_NoMemory();
1538 return -1;
1539 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1541 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyErr_NoMemory();
1543 return -1;
1544 }
1545 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1546 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001547 _PyUnicode_UTF8(unicode) = NULL;
1548 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001549 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1550 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001551 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552 PyObject_FREE(_PyUnicode_WSTR(unicode));
1553 _PyUnicode_WSTR(unicode) = NULL;
1554 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1555#else
1556 assert(num_surrogates == 0);
1557
Victor Stinnerc3c74152011-10-02 20:39:55 +02001558 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001560 _PyUnicode_UTF8(unicode) = NULL;
1561 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1563#endif
1564 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1565 }
1566 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001567 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 return 0;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001572unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573{
Walter Dörwald16807132007-05-25 13:52:07 +00001574 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 case SSTATE_NOT_INTERNED:
1576 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 case SSTATE_INTERNED_MORTAL:
1579 /* revive dead object temporarily for DelItem */
1580 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001581 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 Py_FatalError(
1583 "deletion of interned string failed");
1584 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001585
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 case SSTATE_INTERNED_IMMORTAL:
1587 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 default:
1590 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001591 }
1592
Victor Stinner03490912011-10-03 23:45:12 +02001593 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001595 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001596 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1598 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601}
1602
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons,
   i.e. the empty string object or the cached one-character string stored in
   unicode_latin1[] for its character; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1619
Alexander Belopolsky40018472011-02-26 01:02:56 +00001620static int
Victor Stinner488fa492011-12-12 00:01:39 +01001621unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001622{
Victor Stinner488fa492011-12-12 00:01:39 +01001623 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (Py_REFCNT(unicode) != 1)
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (_PyUnicode_HASH(unicode) != -1)
1627 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_CHECK_INTERNED(unicode))
1629 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001630 if (!PyUnicode_CheckExact(unicode))
1631 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001632#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001633 /* singleton refcount is greater than 1 */
1634 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001635#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636 return 1;
1637}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001638
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639static int
1640unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1641{
1642 PyObject *unicode;
1643 Py_ssize_t old_length;
1644
1645 assert(p_unicode != NULL);
1646 unicode = *p_unicode;
1647
1648 assert(unicode != NULL);
1649 assert(PyUnicode_Check(unicode));
1650 assert(0 <= length);
1651
Victor Stinner910337b2011-10-03 03:20:16 +02001652 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001653 old_length = PyUnicode_WSTR_LENGTH(unicode);
1654 else
1655 old_length = PyUnicode_GET_LENGTH(unicode);
1656 if (old_length == length)
1657 return 0;
1658
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001660 _Py_INCREF_UNICODE_EMPTY();
1661 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 Py_DECREF(*p_unicode);
1664 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001665 return 0;
1666 }
1667
Victor Stinner488fa492011-12-12 00:01:39 +01001668 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001669 PyObject *copy = resize_copy(unicode, length);
1670 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001671 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 Py_DECREF(*p_unicode);
1673 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001675 }
1676
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001678 PyObject *new_unicode = resize_compact(unicode, length);
1679 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001681 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001683 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001684 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001685}
1686
Alexander Belopolsky40018472011-02-26 01:02:56 +00001687int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001689{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001690 PyObject *unicode;
1691 if (p_unicode == NULL) {
1692 PyErr_BadInternalCall();
1693 return -1;
1694 }
1695 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001696 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697 {
1698 PyErr_BadInternalCall();
1699 return -1;
1700 }
1701 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001702}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001703
Victor Stinnerc5166102012-02-22 13:55:02 +01001704/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001705
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001706 WARNING: The function doesn't copy the terminating null character and
1707 doesn't check the maximum character (may write a latin1 character in an
1708 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001709static void
1710unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1711 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001712{
1713 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1714 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001715 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001716
1717 switch (kind) {
1718 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001720#ifdef Py_DEBUG
1721 if (PyUnicode_IS_ASCII(unicode)) {
1722 Py_UCS4 maxchar = ucs1lib_find_max_char(
1723 (const Py_UCS1*)str,
1724 (const Py_UCS1*)str + len);
1725 assert(maxchar < 128);
1726 }
1727#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001728 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001729 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001730 }
1731 case PyUnicode_2BYTE_KIND: {
1732 Py_UCS2 *start = (Py_UCS2 *)data + index;
1733 Py_UCS2 *ucs2 = start;
1734 assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
Victor Stinner184252a2012-06-16 02:57:41 +02001736 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 *ucs2 = (Py_UCS2)*str;
1738
1739 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001740 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001741 }
1742 default: {
1743 Py_UCS4 *start = (Py_UCS4 *)data + index;
1744 Py_UCS4 *ucs4 = start;
1745 assert(kind == PyUnicode_4BYTE_KIND);
1746 assert(index <= PyUnicode_GET_LENGTH(unicode));
1747
Victor Stinner184252a2012-06-16 02:57:41 +02001748 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001749 *ucs4 = (Py_UCS4)*str;
1750
1751 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001752 }
1753 }
1754}
1755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756static PyObject*
1757get_latin1_char(unsigned char ch)
1758{
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 if (!unicode)
1763 return NULL;
1764 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001765 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 unicode_latin1[ch] = unicode;
1767 }
1768 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001769 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770}
1771
Victor Stinner985a82a2014-01-03 12:53:47 +01001772static PyObject*
1773unicode_char(Py_UCS4 ch)
1774{
1775 PyObject *unicode;
1776
1777 assert(ch <= MAX_UNICODE);
1778
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001779 if (ch < 256)
1780 return get_latin1_char(ch);
1781
Victor Stinner985a82a2014-01-03 12:53:47 +01001782 unicode = PyUnicode_New(1, ch);
1783 if (unicode == NULL)
1784 return NULL;
1785 switch (PyUnicode_KIND(unicode)) {
1786 case PyUnicode_1BYTE_KIND:
1787 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1788 break;
1789 case PyUnicode_2BYTE_KIND:
1790 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1791 break;
1792 default:
1793 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1794 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1795 }
1796 assert(_PyUnicode_CheckConsistency(unicode, 1));
1797 return unicode;
1798}
1799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001803 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 Py_UCS4 maxchar = 0;
1805 Py_ssize_t num_surrogates;
1806
1807 if (u == NULL)
1808 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810 /* If the Unicode data is known at construction time, we can apply
1811 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001814 if (size == 0)
1815 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 /* Single character Unicode objects in the Latin-1 range are
1818 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001819 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 return get_latin1_char((unsigned char)*u);
1821
1822 /* If not empty and not single character, copy the Unicode data
1823 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001824 if (find_maxchar_surrogates(u, u + size,
1825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return NULL;
1827
Victor Stinner8faf8212011-12-08 22:14:11 +01001828 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 if (!unicode)
1830 return NULL;
1831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 switch (PyUnicode_KIND(unicode)) {
1833 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1836 break;
1837 case PyUnicode_2BYTE_KIND:
1838#if Py_UNICODE_SIZE == 2
1839 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1840#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001841 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1843#endif
1844 break;
1845 case PyUnicode_4BYTE_KIND:
1846#if SIZEOF_WCHAR_T == 2
1847 /* This is the only case which has to process surrogates, thus
1848 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001849 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850#else
1851 assert(num_surrogates == 0);
1852 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1853#endif
1854 break;
1855 default:
1856 assert(0 && "Impossible state");
1857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001859 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860}
1861
Alexander Belopolsky40018472011-02-26 01:02:56 +00001862PyObject *
1863PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001864{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 if (size < 0) {
1866 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 return NULL;
1869 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001870 if (u != NULL)
1871 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1872 else
1873 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001874}
1875
Alexander Belopolsky40018472011-02-26 01:02:56 +00001876PyObject *
1877PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001878{
1879 size_t size = strlen(u);
1880 if (size > PY_SSIZE_T_MAX) {
1881 PyErr_SetString(PyExc_OverflowError, "input too long");
1882 return NULL;
1883 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001884 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001885}
1886
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001887PyObject *
1888_PyUnicode_FromId(_Py_Identifier *id)
1889{
1890 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001891 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1892 strlen(id->string),
1893 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001894 if (!id->object)
1895 return NULL;
1896 PyUnicode_InternInPlace(&id->object);
1897 assert(!id->next);
1898 id->next = static_strings;
1899 static_strings = id;
1900 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001901 return id->object;
1902}
1903
1904void
1905_PyUnicode_ClearStaticStrings()
1906{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001907 _Py_Identifier *tmp, *s = static_strings;
1908 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001909 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 tmp = s->next;
1911 s->next = NULL;
1912 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001914 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001915}
1916
Benjamin Peterson0df54292012-03-26 14:50:32 -04001917/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Victor Stinnerd3f08822012-05-29 12:57:52 +02001919PyObject*
1920_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001921{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001922 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001923 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001924 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001926 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001927#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001928 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001929 }
Victor Stinner785938e2011-12-11 20:09:03 +01001930 unicode = PyUnicode_New(size, 127);
1931 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001932 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001933 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1934 assert(_PyUnicode_CheckConsistency(unicode, 1));
1935 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001936}
1937
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938static Py_UCS4
1939kind_maxchar_limit(unsigned int kind)
1940{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001941 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001942 case PyUnicode_1BYTE_KIND:
1943 return 0x80;
1944 case PyUnicode_2BYTE_KIND:
1945 return 0x100;
1946 case PyUnicode_4BYTE_KIND:
1947 return 0x10000;
1948 default:
1949 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001950 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001951 }
1952}
1953
Victor Stinnere6abb482012-05-02 01:15:40 +02001954Py_LOCAL_INLINE(Py_UCS4)
1955align_maxchar(Py_UCS4 maxchar)
1956{
1957 if (maxchar <= 127)
1958 return 127;
1959 else if (maxchar <= 255)
1960 return 255;
1961 else if (maxchar <= 65535)
1962 return 65535;
1963 else
1964 return MAX_UNICODE;
1965}
1966
Victor Stinner702c7342011-10-05 13:50:52 +02001967static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001968_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001972
Serhiy Storchaka678db842013-01-26 12:16:36 +02001973 if (size == 0)
1974 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001976 if (size == 1)
1977 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
1983 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001986}
1987
Victor Stinnere57b1c02011-09-28 22:20:48 +02001988static PyObject*
1989_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990{
1991 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001993
Serhiy Storchaka678db842013-01-26 12:16:36 +02001994 if (size == 0)
1995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001996 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 if (size == 1)
1998 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002000 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (!res)
2003 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002006 else {
2007 _PyUnicode_CONVERT_BYTES(
2008 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2009 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002010 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 return res;
2012}
2013
Victor Stinnere57b1c02011-09-28 22:20:48 +02002014static PyObject*
2015_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016{
2017 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019
Serhiy Storchaka678db842013-01-26 12:16:36 +02002020 if (size == 0)
2021 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 if (size == 1)
2024 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002025
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002026 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002027 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (!res)
2029 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002030 if (max_char < 256)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2032 PyUnicode_1BYTE_DATA(res));
2033 else if (max_char < 0x10000)
2034 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2035 PyUnicode_2BYTE_DATA(res));
2036 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002038 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 return res;
2040}
2041
2042PyObject*
2043PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2044{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002045 if (size < 0) {
2046 PyErr_SetString(PyExc_ValueError, "size must be positive");
2047 return NULL;
2048 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002049 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002055 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002056 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002057 PyErr_SetString(PyExc_SystemError, "invalid kind");
2058 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060}
2061
Victor Stinnerece58de2012-04-23 23:36:38 +02002062Py_UCS4
2063_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2064{
2065 enum PyUnicode_Kind kind;
2066 void *startptr, *endptr;
2067
2068 assert(PyUnicode_IS_READY(unicode));
2069 assert(0 <= start);
2070 assert(end <= PyUnicode_GET_LENGTH(unicode));
2071 assert(start <= end);
2072
2073 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2074 return PyUnicode_MAX_CHAR_VALUE(unicode);
2075
2076 if (start == end)
2077 return 127;
2078
Victor Stinner94d558b2012-04-27 22:26:58 +02002079 if (PyUnicode_IS_ASCII(unicode))
2080 return 127;
2081
Victor Stinnerece58de2012-04-23 23:36:38 +02002082 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002083 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002084 endptr = (char *)startptr + end * kind;
2085 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002086 switch(kind) {
2087 case PyUnicode_1BYTE_KIND:
2088 return ucs1lib_find_max_char(startptr, endptr);
2089 case PyUnicode_2BYTE_KIND:
2090 return ucs2lib_find_max_char(startptr, endptr);
2091 case PyUnicode_4BYTE_KIND:
2092 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002094 assert(0);
2095 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002096 }
2097}
2098
Victor Stinner25a4b292011-10-06 12:31:55 +02002099/* Ensure that a string uses the most efficient storage, if it is not the
2100 case: create a new string with of the right kind. Write NULL into *p_unicode
2101 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002102static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002103unicode_adjust_maxchar(PyObject **p_unicode)
2104{
2105 PyObject *unicode, *copy;
2106 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002107 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002108 unsigned int kind;
2109
2110 assert(p_unicode != NULL);
2111 unicode = *p_unicode;
2112 assert(PyUnicode_IS_READY(unicode));
2113 if (PyUnicode_IS_ASCII(unicode))
2114 return;
2115
2116 len = PyUnicode_GET_LENGTH(unicode);
2117 kind = PyUnicode_KIND(unicode);
2118 if (kind == PyUnicode_1BYTE_KIND) {
2119 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002120 max_char = ucs1lib_find_max_char(u, u + len);
2121 if (max_char >= 128)
2122 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002123 }
2124 else if (kind == PyUnicode_2BYTE_KIND) {
2125 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs2lib_find_max_char(u, u + len);
2127 if (max_char >= 256)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
2130 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002133 max_char = ucs4lib_find_max_char(u, u + len);
2134 if (max_char >= 0x10000)
2135 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002138 if (copy != NULL)
2139 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002140 Py_DECREF(unicode);
2141 *p_unicode = copy;
2142}
2143
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002145_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146{
Victor Stinner87af4f22011-11-21 23:03:47 +01002147 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner034f6cf2011-09-30 02:26:44 +02002150 if (!PyUnicode_Check(unicode)) {
2151 PyErr_BadInternalCall();
2152 return NULL;
2153 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002154 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156
Victor Stinner87af4f22011-11-21 23:03:47 +01002157 length = PyUnicode_GET_LENGTH(unicode);
2158 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 if (!copy)
2160 return NULL;
2161 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2162
Victor Stinner87af4f22011-11-21 23:03:47 +01002163 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2164 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002165 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002166 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002167}
2168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170/* Widen Unicode objects to larger buffers. Don't write terminating null
2171 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172
2173void*
2174_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2175{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002176 Py_ssize_t len;
2177 void *result;
2178 unsigned int skind;
2179
Benjamin Petersonbac79492012-01-14 13:34:47 -05002180 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 return NULL;
2182
2183 len = PyUnicode_GET_LENGTH(s);
2184 skind = PyUnicode_KIND(s);
2185 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002189 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002190 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002191 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002192 if (!result)
2193 return PyErr_NoMemory();
2194 assert(skind == PyUnicode_1BYTE_KIND);
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS1, Py_UCS2,
2197 PyUnicode_1BYTE_DATA(s),
2198 PyUnicode_1BYTE_DATA(s) + len,
2199 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002201 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002202 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 if (!result)
2204 return PyErr_NoMemory();
2205 if (skind == PyUnicode_2BYTE_KIND) {
2206 _PyUnicode_CONVERT_BYTES(
2207 Py_UCS2, Py_UCS4,
2208 PyUnicode_2BYTE_DATA(s),
2209 PyUnicode_2BYTE_DATA(s) + len,
2210 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002212 else {
2213 assert(skind == PyUnicode_1BYTE_KIND);
2214 _PyUnicode_CONVERT_BYTES(
2215 Py_UCS1, Py_UCS4,
2216 PyUnicode_1BYTE_DATA(s),
2217 PyUnicode_1BYTE_DATA(s) + len,
2218 result);
2219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002221 default:
2222 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Victor Stinner01698042011-10-04 00:04:26 +02002224 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return NULL;
2226}
2227
2228static Py_UCS4*
2229as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230 int copy_null)
2231{
2232 int kind;
2233 void *data;
2234 Py_ssize_t len, targetlen;
2235 if (PyUnicode_READY(string) == -1)
2236 return NULL;
2237 kind = PyUnicode_KIND(string);
2238 data = PyUnicode_DATA(string);
2239 len = PyUnicode_GET_LENGTH(string);
2240 targetlen = len;
2241 if (copy_null)
2242 targetlen++;
2243 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002244 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
/* Size of the scratch buffers used below to format integers (up to %lld)
   and pointers (%p).  A long long needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits, plus 1 for the sign;
   53/22 is used as an upper bound for log10(256).
   NOTE(review): the constant is 2 rather than 1, presumably to leave room
   for the terminating null byte as well -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG) * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002318
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002319static int
2320unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2321 Py_ssize_t width, Py_ssize_t precision)
2322{
2323 Py_ssize_t length, fill, arglen;
2324 Py_UCS4 maxchar;
2325
2326 if (PyUnicode_READY(str) == -1)
2327 return -1;
2328
2329 length = PyUnicode_GET_LENGTH(str);
2330 if ((precision == -1 || precision >= length)
2331 && width <= length)
2332 return _PyUnicodeWriter_WriteStr(writer, str);
2333
2334 if (precision != -1)
2335 length = Py_MIN(precision, length);
2336
2337 arglen = Py_MAX(length, width);
2338 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2339 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2340 else
2341 maxchar = writer->maxchar;
2342
2343 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2344 return -1;
2345
2346 if (width > length) {
2347 fill = width - length;
2348 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2349 return -1;
2350 writer->pos += fill;
2351 }
2352
2353 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2354 str, 0, length);
2355 writer->pos += length;
2356 return 0;
2357}
2358
2359static int
2360unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2361 Py_ssize_t width, Py_ssize_t precision)
2362{
2363 /* UTF-8 */
2364 Py_ssize_t length;
2365 PyObject *unicode;
2366 int res;
2367
2368 length = strlen(str);
2369 if (precision != -1)
2370 length = Py_MIN(length, precision);
2371 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2372 if (unicode == NULL)
2373 return -1;
2374
2375 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2376 Py_DECREF(unicode);
2377 return res;
2378}
2379
Victor Stinner96865452011-03-01 23:44:09 +00002380static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002381unicode_fromformat_arg(_PyUnicodeWriter *writer,
2382 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002383{
Victor Stinnere215d962012-10-06 23:03:36 +02002384 const char *p;
2385 Py_ssize_t len;
2386 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002387 Py_ssize_t width;
2388 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002389 int longflag;
2390 int longlongflag;
2391 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002392 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002393
2394 p = f;
2395 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002396 zeropad = 0;
2397 if (*f == '0') {
2398 zeropad = 1;
2399 f++;
2400 }
Victor Stinner96865452011-03-01 23:44:09 +00002401
2402 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002403 width = -1;
2404 if (Py_ISDIGIT((unsigned)*f)) {
2405 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002406 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002407 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002408 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002409 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002410 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002411 return NULL;
2412 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002414 f++;
2415 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 }
2417 precision = -1;
2418 if (*f == '.') {
2419 f++;
2420 if (Py_ISDIGIT((unsigned)*f)) {
2421 precision = (*f - '0');
2422 f++;
2423 while (Py_ISDIGIT((unsigned)*f)) {
2424 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2425 PyErr_SetString(PyExc_ValueError,
2426 "precision too big");
2427 return NULL;
2428 }
2429 precision = (precision * 10) + (*f - '0');
2430 f++;
2431 }
2432 }
Victor Stinner96865452011-03-01 23:44:09 +00002433 if (*f == '%') {
2434 /* "%.3%s" => f points to "3" */
2435 f--;
2436 }
2437 }
2438 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002439 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002440 f--;
2441 }
Victor Stinner96865452011-03-01 23:44:09 +00002442
2443 /* Handle %ld, %lu, %lld and %llu. */
2444 longflag = 0;
2445 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002446 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002447 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002448 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002449 longflag = 1;
2450 ++f;
2451 }
2452#ifdef HAVE_LONG_LONG
2453 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002454 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002455 longlongflag = 1;
2456 f += 2;
2457 }
2458#endif
2459 }
2460 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002461 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002462 size_tflag = 1;
2463 ++f;
2464 }
Victor Stinnere215d962012-10-06 23:03:36 +02002465
2466 if (f[1] == '\0')
2467 writer->overallocate = 0;
2468
2469 switch (*f) {
2470 case 'c':
2471 {
2472 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002473 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002474 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002475 "character argument not in range(0x110000)");
2476 return NULL;
2477 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002478 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002479 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002480 break;
2481 }
2482
2483 case 'i':
2484 case 'd':
2485 case 'u':
2486 case 'x':
2487 {
2488 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002489 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002490 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002493 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002494 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002495 va_arg(*vargs, unsigned long));
2496#ifdef HAVE_LONG_LONG
2497 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002498 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002499 va_arg(*vargs, unsigned PY_LONG_LONG));
2500#endif
2501 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002502 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002503 va_arg(*vargs, size_t));
2504 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002505 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002506 va_arg(*vargs, unsigned int));
2507 }
2508 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002509 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002510 }
2511 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002512 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002513 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002514 va_arg(*vargs, long));
2515#ifdef HAVE_LONG_LONG
2516 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002517 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002518 va_arg(*vargs, PY_LONG_LONG));
2519#endif
2520 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002521 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002522 va_arg(*vargs, Py_ssize_t));
2523 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002524 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002525 va_arg(*vargs, int));
2526 }
2527 assert(len >= 0);
2528
Victor Stinnere215d962012-10-06 23:03:36 +02002529 if (precision < len)
2530 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002531
2532 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2534 return NULL;
2535
Victor Stinnere215d962012-10-06 23:03:36 +02002536 if (width > precision) {
2537 Py_UCS4 fillchar;
2538 fill = width - precision;
2539 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2541 return NULL;
2542 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002543 }
Victor Stinner15a11362012-10-06 23:48:20 +02002544 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002545 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002546 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2547 return NULL;
2548 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002549 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002550
Victor Stinner4a587072013-11-19 12:54:53 +01002551 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2552 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002553 break;
2554 }
2555
2556 case 'p':
2557 {
2558 char number[MAX_LONG_LONG_CHARS];
2559
2560 len = sprintf(number, "%p", va_arg(*vargs, void*));
2561 assert(len >= 0);
2562
2563 /* %p is ill-defined: ensure leading 0x. */
2564 if (number[1] == 'X')
2565 number[1] = 'x';
2566 else if (number[1] != 'x') {
2567 memmove(number + 2, number,
2568 strlen(number) + 1);
2569 number[0] = '0';
2570 number[1] = 'x';
2571 len += 2;
2572 }
2573
Victor Stinner4a587072013-11-19 12:54:53 +01002574 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002575 return NULL;
2576 break;
2577 }
2578
2579 case 's':
2580 {
2581 /* UTF-8 */
2582 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002584 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002585 break;
2586 }
2587
2588 case 'U':
2589 {
2590 PyObject *obj = va_arg(*vargs, PyObject *);
2591 assert(obj && _PyUnicode_CHECK(obj));
2592
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002594 return NULL;
2595 break;
2596 }
2597
2598 case 'V':
2599 {
2600 PyObject *obj = va_arg(*vargs, PyObject *);
2601 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002602 if (obj) {
2603 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002605 return NULL;
2606 }
2607 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 assert(str != NULL);
2609 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002611 }
2612 break;
2613 }
2614
2615 case 'S':
2616 {
2617 PyObject *obj = va_arg(*vargs, PyObject *);
2618 PyObject *str;
2619 assert(obj);
2620 str = PyObject_Str(obj);
2621 if (!str)
2622 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002623 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002624 Py_DECREF(str);
2625 return NULL;
2626 }
2627 Py_DECREF(str);
2628 break;
2629 }
2630
2631 case 'R':
2632 {
2633 PyObject *obj = va_arg(*vargs, PyObject *);
2634 PyObject *repr;
2635 assert(obj);
2636 repr = PyObject_Repr(obj);
2637 if (!repr)
2638 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002640 Py_DECREF(repr);
2641 return NULL;
2642 }
2643 Py_DECREF(repr);
2644 break;
2645 }
2646
2647 case 'A':
2648 {
2649 PyObject *obj = va_arg(*vargs, PyObject *);
2650 PyObject *ascii;
2651 assert(obj);
2652 ascii = PyObject_ASCII(obj);
2653 if (!ascii)
2654 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002655 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002656 Py_DECREF(ascii);
2657 return NULL;
2658 }
2659 Py_DECREF(ascii);
2660 break;
2661 }
2662
2663 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667
2668 default:
2669 /* if we stumble upon an unknown formatting code, copy the rest
2670 of the format string to the output string. (we cannot just
2671 skip the code, since there's no way to know what's in the
2672 argument list) */
2673 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002674 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002675 return NULL;
2676 f = p+len;
2677 return f;
2678 }
2679
2680 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002681 return f;
2682}
2683
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684PyObject *
2685PyUnicode_FromFormatV(const char *format, va_list vargs)
2686{
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_list vargs2;
2688 const char *f;
2689 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002690
Victor Stinner8f674cc2013-04-17 23:02:17 +02002691 _PyUnicodeWriter_Init(&writer);
2692 writer.min_length = strlen(format) + 100;
2693 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2696 Copy it to be able to pass a reference to a subfunction. */
2697 Py_VA_COPY(vargs2, vargs);
2698
2699 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002701 f = unicode_fromformat_arg(&writer, f, &vargs2);
2702 if (f == NULL)
2703 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002706 const char *p;
2707 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 p = f;
2710 do
2711 {
2712 if ((unsigned char)*p > 127) {
2713 PyErr_Format(PyExc_ValueError,
2714 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2715 "string, got a non-ASCII byte: 0x%02x",
2716 (unsigned char)*p);
2717 return NULL;
2718 }
2719 p++;
2720 }
2721 while (*p != '\0' && *p != '%');
2722 len = p - f;
2723
2724 if (*p == '\0')
2725 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002726
2727 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002728 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002732 }
Victor Stinnere215d962012-10-06 23:03:36 +02002733 return _PyUnicodeWriter_Finish(&writer);
2734
2735 fail:
2736 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002738}
2739
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740PyObject *
2741PyUnicode_FromFormat(const char *format, ...)
2742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 PyObject* ret;
2744 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745
2746#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002748#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 ret = PyUnicode_FromFormatV(format, vargs);
2752 va_end(vargs);
2753 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002754}
2755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756#ifdef HAVE_WCHAR_H
2757
Victor Stinner5593d8a2010-10-02 11:11:27 +00002758/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2759 convert a Unicode object to a wide character string.
2760
Victor Stinnerd88d9832011-09-06 02:00:05 +02002761 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002762 character) required to convert the unicode object. Ignore size argument.
2763
Victor Stinnerd88d9832011-09-06 02:00:05 +02002764 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002765 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002768unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 wchar_t *w,
2770 Py_ssize_t size)
2771{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002772 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 const wchar_t *wstr;
2774
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002775 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 if (wstr == NULL)
2777 return -1;
2778
Victor Stinner5593d8a2010-10-02 11:11:27 +00002779 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (size > res)
2781 size = res + 1;
2782 else
2783 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 return res;
2786 }
2787 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002789}
2790
2791Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002792PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002793 wchar_t *w,
2794 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795{
2796 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 PyErr_BadInternalCall();
2798 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002800 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
Victor Stinner137c34c2010-09-29 10:25:54 +00002803wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002804PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002805 Py_ssize_t *size)
2806{
2807 wchar_t* buffer;
2808 Py_ssize_t buflen;
2809
2810 if (unicode == NULL) {
2811 PyErr_BadInternalCall();
2812 return NULL;
2813 }
2814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002815 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 if (buflen == -1)
2817 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002818 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002819 if (buffer == NULL) {
2820 PyErr_NoMemory();
2821 return NULL;
2822 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002823 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002824 if (buflen == -1) {
2825 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002827 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 if (size != NULL)
2829 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 return buffer;
2831}
2832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
Alexander Belopolsky40018472011-02-26 01:02:56 +00002835PyObject *
2836PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002837{
Victor Stinner8faf8212011-12-08 22:14:11 +01002838 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyErr_SetString(PyExc_ValueError,
2840 "chr() arg not in range(0x110000)");
2841 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002843
Victor Stinner985a82a2014-01-03 12:53:47 +01002844 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002845}
2846
Alexander Belopolsky40018472011-02-26 01:02:56 +00002847PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002848PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002850 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002853 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002854 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 Py_INCREF(obj);
2856 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 }
2858 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 /* For a Unicode subtype that's not a Unicode object,
2860 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002861 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002862 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002863 PyErr_Format(PyExc_TypeError,
2864 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002865 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002867}
2868
Alexander Belopolsky40018472011-02-26 01:02:56 +00002869PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002870PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002871 const char *encoding,
2872 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002873{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002875 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002876
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 PyErr_BadInternalCall();
2879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002881
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002882 /* Decoding bytes objects is the most common case and should be fast */
2883 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002884 if (PyBytes_GET_SIZE(obj) == 0)
2885 _Py_RETURN_UNICODE_EMPTY();
2886 v = PyUnicode_Decode(
2887 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2888 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 return v;
2890 }
2891
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyErr_SetString(PyExc_TypeError,
2894 "decoding str is not supported");
2895 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002898 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2899 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2900 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002901 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002902 Py_TYPE(obj)->tp_name);
2903 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002904 }
Tim Petersced69f82003-09-16 20:30:58 +00002905
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002906 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002907 PyBuffer_Release(&buffer);
2908 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002910
Serhiy Storchaka05997252013-01-26 12:14:02 +02002911 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002912 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914}
2915
/* Normalize an encoding name into the caller-supplied buffer lower, which
   has room for lower_len bytes: characters for which Py_ISUPPER() is true
   are mapped through Py_TOLOWER() and '_' is replaced with '-', in order to
   catch e.g. UTF_8.  A NULL encoding is written as "utf-8".

   Return 1 on success.  Return 0 if the normalized name plus its
   terminating null byte does not fit into lower_len bytes (this includes
   lower_len == 0); lower may then hold a partial, unterminated result and
   must not be used. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* lower_len is unsigned: with an empty buffer, lower_len - 1 below
       would wrap around, the end-of-buffer test would never trigger and
       the loop (or the final terminator) would write out of bounds. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 Py_ssize_t size,
2959 const char *encoding,
2960 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002961{
2962 PyObject *buffer = NULL, *unicode;
2963 Py_buffer info;
2964 char lower[11]; /* Enough for any encoding shortcut */
2965
Fred Drakee4315f52000-05-09 19:53:39 +00002966 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002967 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002968 if ((strcmp(lower, "utf-8") == 0) ||
2969 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002970 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002971 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002972 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002973 (strcmp(lower, "iso-8859-1") == 0) ||
2974 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002975 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002976#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002977 else if (strcmp(lower, "mbcs") == 0)
2978 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002979#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if (strcmp(lower, "ascii") == 0)
2981 return PyUnicode_DecodeASCII(s, size, errors);
2982 else if (strcmp(lower, "utf-16") == 0)
2983 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2984 else if (strcmp(lower, "utf-32") == 0)
2985 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987
2988 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002989 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002990 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002991 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002992 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 if (buffer == NULL)
2994 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002995 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (unicode == NULL)
2997 goto onError;
2998 if (!PyUnicode_Check(unicode)) {
2999 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003000 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3001 "use codecs.decode() to decode to arbitrary types",
3002 encoding,
3003 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 Py_DECREF(unicode);
3005 goto onError;
3006 }
3007 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_XDECREF(buffer);
3012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003034 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 const char *encoding,
3043 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044{
3045 PyObject *v;
3046
3047 if (!PyUnicode_Check(unicode)) {
3048 PyErr_BadArgument();
3049 goto onError;
3050 }
3051
3052 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003054
3055 /* Decode via the codec registry */
3056 v = PyCodec_Decode(unicode, encoding, errors);
3057 if (v == NULL)
3058 goto onError;
3059 if (!PyUnicode_Check(v)) {
3060 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003061 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3062 "use codecs.decode() to decode to arbitrary types",
3063 encoding,
3064 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003065 Py_DECREF(v);
3066 goto onError;
3067 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003068 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003071 return NULL;
3072}
3073
Alexander Belopolsky40018472011-02-26 01:02:56 +00003074PyObject *
3075PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003076 Py_ssize_t size,
3077 const char *encoding,
3078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079{
3080 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 unicode = PyUnicode_FromUnicode(s, size);
3083 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3086 Py_DECREF(unicode);
3087 return v;
3088}
3089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090PyObject *
3091PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003092 const char *encoding,
3093 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094{
3095 PyObject *v;
3096
3097 if (!PyUnicode_Check(unicode)) {
3098 PyErr_BadArgument();
3099 goto onError;
3100 }
3101
3102 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003104
3105 /* Encode via the codec registry */
3106 v = PyCodec_Encode(unicode, encoding, errors);
3107 if (v == NULL)
3108 goto onError;
3109 return v;
3110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003112 return NULL;
3113}
3114
/* Locate the first character of wstr that wcstombs() refuses to convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Return the index, in
   wchar_t units, of the first character for which the conversion fails, or
   0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) followed by a terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return (size_t)(char_start - wstr);
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3161
Victor Stinner1b579672011-12-17 05:47:23 +01003162static int
3163locale_error_handler(const char *errors, int *surrogateescape)
3164{
3165 if (errors == NULL) {
3166 *surrogateescape = 0;
3167 return 0;
3168 }
3169
3170 if (strcmp(errors, "strict") == 0) {
3171 *surrogateescape = 0;
3172 return 0;
3173 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003174 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003175 *surrogateescape = 1;
3176 return 0;
3177 }
3178 PyErr_Format(PyExc_ValueError,
3179 "only 'strict' and 'surrogateescape' error handlers "
3180 "are supported, not '%s'",
3181 errors);
3182 return -1;
3183}
3184
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003185PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003186PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003187{
3188 Py_ssize_t wlen, wlen2;
3189 wchar_t *wstr;
3190 PyObject *bytes = NULL;
3191 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003192 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003193 PyObject *exc;
3194 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003195 int surrogateescape;
3196
3197 if (locale_error_handler(errors, &surrogateescape) < 0)
3198 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003199
3200 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3201 if (wstr == NULL)
3202 return NULL;
3203
3204 wlen2 = wcslen(wstr);
3205 if (wlen2 != wlen) {
3206 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003207 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208 return NULL;
3209 }
3210
3211 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003212 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213 char *str;
3214
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003215 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 if (str == NULL) {
3217 if (error_pos == (size_t)-1) {
3218 PyErr_NoMemory();
3219 PyMem_Free(wstr);
3220 return NULL;
3221 }
3222 else {
3223 goto encode_error;
3224 }
3225 }
3226 PyMem_Free(wstr);
3227
3228 bytes = PyBytes_FromString(str);
3229 PyMem_Free(str);
3230 }
3231 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003232 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233 size_t len, len2;
3234
3235 len = wcstombs(NULL, wstr, 0);
3236 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003237 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 goto encode_error;
3239 }
3240
3241 bytes = PyBytes_FromStringAndSize(NULL, len);
3242 if (bytes == NULL) {
3243 PyMem_Free(wstr);
3244 return NULL;
3245 }
3246
3247 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3248 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003249 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003250 goto encode_error;
3251 }
3252 PyMem_Free(wstr);
3253 }
3254 return bytes;
3255
3256encode_error:
3257 errmsg = strerror(errno);
3258 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003259
3260 if (error_pos == (size_t)-1)
3261 error_pos = wcstombs_errorpos(wstr);
3262
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 PyMem_Free(wstr);
3264 Py_XDECREF(bytes);
3265
Victor Stinner2f197072011-12-17 07:08:30 +01003266 if (errmsg != NULL) {
3267 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003268 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003269 if (wstr != NULL) {
3270 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003271 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003272 } else
3273 errmsg = NULL;
3274 }
3275 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003276 reason = PyUnicode_FromString(
3277 "wcstombs() encountered an unencodable "
3278 "wide character");
3279 if (reason == NULL)
3280 return NULL;
3281
3282 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3283 "locale", unicode,
3284 (Py_ssize_t)error_pos,
3285 (Py_ssize_t)(error_pos+1),
3286 reason);
3287 Py_DECREF(reason);
3288 if (exc != NULL) {
3289 PyCodec_StrictErrors(exc);
3290 Py_XDECREF(exc);
3291 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003292 return NULL;
3293}
3294
Victor Stinnerad158722010-10-27 00:25:46 +00003295PyObject *
3296PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003297{
Victor Stinner99b95382011-07-04 14:23:54 +02003298#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003299 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003300#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003302#else
Victor Stinner793b5312011-04-27 00:24:21 +02003303 PyInterpreterState *interp = PyThreadState_GET()->interp;
3304 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3305 cannot use it to encode and decode filenames before it is loaded. Load
3306 the Python codec requires to encode at least its own filename. Use the C
3307 version of the locale codec until the codec registry is initialized and
3308 the Python codec is loaded.
3309
3310 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3311 cannot only rely on it: check also interp->fscodec_initialized for
3312 subinterpreters. */
3313 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003314 return PyUnicode_AsEncodedString(unicode,
3315 Py_FileSystemDefaultEncoding,
3316 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003317 }
3318 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003319 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003320 }
Victor Stinnerad158722010-10-27 00:25:46 +00003321#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003322}
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
3325PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 const char *encoding,
3327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328{
3329 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003330 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (!PyUnicode_Check(unicode)) {
3333 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
Fred Drakee4315f52000-05-09 19:53:39 +00003336
Fred Drakee4315f52000-05-09 19:53:39 +00003337 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003338 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003339 if ((strcmp(lower, "utf-8") == 0) ||
3340 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003341 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003342 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003344 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003346 }
Victor Stinner37296e82010-06-10 13:36:23 +00003347 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003348 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003349 (strcmp(lower, "iso-8859-1") == 0) ||
3350 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003352#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003353 else if (strcmp(lower, "mbcs") == 0)
3354 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003355#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003356 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003361 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003363 return NULL;
3364
3365 /* The normal path */
3366 if (PyBytes_Check(v))
3367 return v;
3368
3369 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003371 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003372 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003373
3374 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003375 "encoder %s returned bytearray instead of bytes; "
3376 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003377 encoding);
3378 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003379 Py_DECREF(v);
3380 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003382
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003383 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3384 Py_DECREF(v);
3385 return b;
3386 }
3387
3388 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003389 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3390 "use codecs.encode() to encode to arbitrary types",
3391 encoding,
3392 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003393 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003394 return NULL;
3395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
3409 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411
3412 /* Encode via the codec registry */
3413 v = PyCodec_Encode(unicode, encoding, errors);
3414 if (v == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(v)) {
3417 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003418 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3419 "use codecs.encode() to encode to arbitrary types",
3420 encoding,
3421 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 Py_DECREF(v);
3423 goto onError;
3424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003426
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 return NULL;
3429}
3430
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot decode.

   Return the byte offset of the sequence on which mbrtowc() reports a
   conversion error or an incomplete character.  Return 0 if no such
   sequence is found, or when mbrtowc() is not available (HAVE_MBRTOWC
   undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3461
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003462PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003463PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003464 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003465{
3466 wchar_t smallbuf[256];
3467 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3468 wchar_t *wstr;
3469 size_t wlen, wlen2;
3470 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003471 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003472 size_t error_pos;
3473 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003474 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3475 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003476
3477 if (locale_error_handler(errors, &surrogateescape) < 0)
3478 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003479
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003480 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3481 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482 return NULL;
3483 }
3484
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003485 if (surrogateescape) {
3486 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003487 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 if (wstr == NULL) {
3489 if (wlen == (size_t)-1)
3490 PyErr_NoMemory();
3491 else
3492 PyErr_SetFromErrno(PyExc_OSError);
3493 return NULL;
3494 }
3495
3496 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003497 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003498 }
3499 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501#ifndef HAVE_BROKEN_MBSTOWCS
3502 wlen = mbstowcs(NULL, str, 0);
3503#else
3504 wlen = len;
3505#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003506 if (wlen == (size_t)-1)
3507 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508 if (wlen+1 <= smallbuf_len) {
3509 wstr = smallbuf;
3510 }
3511 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003512 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003513 if (!wstr)
3514 return PyErr_NoMemory();
3515 }
3516
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003517 wlen2 = mbstowcs(wstr, str, wlen+1);
3518 if (wlen2 == (size_t)-1) {
3519 if (wstr != smallbuf)
3520 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003521 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 }
3523#ifdef HAVE_BROKEN_MBSTOWCS
3524 assert(wlen2 == wlen);
3525#endif
3526 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3527 if (wstr != smallbuf)
3528 PyMem_Free(wstr);
3529 }
3530 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003531
3532decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003533 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003534 errmsg = strerror(errno);
3535 assert(errmsg != NULL);
3536
3537 error_pos = mbstowcs_errorpos(str, len);
3538 if (errmsg != NULL) {
3539 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003540 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003541 if (wstr != NULL) {
3542 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003543 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003544 }
Victor Stinner2f197072011-12-17 07:08:30 +01003545 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003546 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003547 reason = PyUnicode_FromString(
3548 "mbstowcs() encountered an invalid multibyte sequence");
3549 if (reason == NULL)
3550 return NULL;
3551
3552 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3553 "locale", str, len,
3554 (Py_ssize_t)error_pos,
3555 (Py_ssize_t)(error_pos+1),
3556 reason);
3557 Py_DECREF(reason);
3558 if (exc != NULL) {
3559 PyCodec_StrictErrors(exc);
3560 Py_XDECREF(exc);
3561 }
3562 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003563}
3564
3565PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003566PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003567{
3568 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003569 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003570}
3571
3572
3573PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003574PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003576 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3577}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578
Christian Heimes5894ba72007-11-04 11:43:14 +00003579PyObject*
3580PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3581{
Victor Stinner99b95382011-07-04 14:23:54 +02003582#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003583 return PyUnicode_DecodeMBCS(s, size, NULL);
3584#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003585 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003586#else
Victor Stinner793b5312011-04-27 00:24:21 +02003587 PyInterpreterState *interp = PyThreadState_GET()->interp;
3588 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3589 cannot use it to encode and decode filenames before it is loaded. Load
3590 the Python codec requires to encode at least its own filename. Use the C
3591 version of the locale codec until the codec registry is initialized and
3592 the Python codec is loaded.
3593
3594 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3595 cannot only rely on it: check also interp->fscodec_initialized for
3596 subinterpreters. */
3597 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003598 return PyUnicode_Decode(s, size,
3599 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003600 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601 }
3602 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003603 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003604 }
Victor Stinnerad158722010-10-27 00:25:46 +00003605#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003606}
3607
Martin v. Löwis011e8422009-05-05 04:43:17 +00003608
3609int
3610PyUnicode_FSConverter(PyObject* arg, void* addr)
3611{
3612 PyObject *output = NULL;
3613 Py_ssize_t size;
3614 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003615 if (arg == NULL) {
3616 Py_DECREF(*(PyObject**)addr);
3617 return 1;
3618 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003619 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 output = arg;
3621 Py_INCREF(output);
3622 }
3623 else {
3624 arg = PyUnicode_FromObject(arg);
3625 if (!arg)
3626 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003627 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003628 Py_DECREF(arg);
3629 if (!output)
3630 return 0;
3631 if (!PyBytes_Check(output)) {
3632 Py_DECREF(output);
3633 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3634 return 0;
3635 }
3636 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003637 size = PyBytes_GET_SIZE(output);
3638 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003639 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003640 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003641 Py_DECREF(output);
3642 return 0;
3643 }
3644 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003645 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003646}
3647
3648
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003649int
3650PyUnicode_FSDecoder(PyObject* arg, void* addr)
3651{
3652 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003653 if (arg == NULL) {
3654 Py_DECREF(*(PyObject**)addr);
3655 return 1;
3656 }
3657 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003658 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003659 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003660 output = arg;
3661 Py_INCREF(output);
3662 }
3663 else {
3664 arg = PyBytes_FromObject(arg);
3665 if (!arg)
3666 return 0;
3667 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3668 PyBytes_GET_SIZE(arg));
3669 Py_DECREF(arg);
3670 if (!output)
3671 return 0;
3672 if (!PyUnicode_Check(output)) {
3673 Py_DECREF(output);
3674 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3675 return 0;
3676 }
3677 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003678 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003679 Py_DECREF(output);
3680 return 0;
3681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003683 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003684 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003685 Py_DECREF(output);
3686 return 0;
3687 }
3688 *(PyObject**)addr = output;
3689 return Py_CLEANUP_SUPPORTED;
3690}
3691
3692
Martin v. Löwis5b222132007-06-10 09:51:05 +00003693char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003694PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003695{
Christian Heimesf3863112007-11-22 07:46:41 +00003696 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003697
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003703 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003705 if (PyUnicode_UTF8(unicode) == NULL) {
3706 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3708 if (bytes == NULL)
3709 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003710 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3711 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003712 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 Py_DECREF(bytes);
3714 return NULL;
3715 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3717 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3718 PyBytes_AS_STRING(bytes),
3719 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 Py_DECREF(bytes);
3721 }
3722
3723 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003724 *psize = PyUnicode_UTF8_LENGTH(unicode);
3725 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003726}
3727
3728char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3732}
3733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734Py_UNICODE *
3735PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 const unsigned char *one_byte;
3738#if SIZEOF_WCHAR_T == 4
3739 const Py_UCS2 *two_bytes;
3740#else
3741 const Py_UCS4 *four_bytes;
3742 const Py_UCS4 *ucs4_end;
3743 Py_ssize_t num_surrogates;
3744#endif
3745 wchar_t *w;
3746 wchar_t *wchar_end;
3747
3748 if (!PyUnicode_Check(unicode)) {
3749 PyErr_BadArgument();
3750 return NULL;
3751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 assert(_PyUnicode_KIND(unicode) != 0);
3755 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3760 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 num_surrogates = 0;
3762
3763 for (; four_bytes < ucs4_end; ++four_bytes) {
3764 if (*four_bytes > 0xFFFF)
3765 ++num_surrogates;
3766 }
3767
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3769 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3770 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 PyErr_NoMemory();
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 w = _PyUnicode_WSTR(unicode);
3777 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3778 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3780 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003781 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003783 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3784 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 }
3786 else
3787 *w = *four_bytes;
3788
3789 if (w > wchar_end) {
3790 assert(0 && "Miscalculated string end");
3791 }
3792 }
3793 *w = 0;
3794#else
3795 /* sizeof(wchar_t) == 4 */
3796 Py_FatalError("Impossible unicode object state, wstr and str "
3797 "should share memory already.");
3798 return NULL;
3799#endif
3800 }
3801 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003802 if ((size_t)_PyUnicode_LENGTH(unicode) >
3803 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3804 PyErr_NoMemory();
3805 return NULL;
3806 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003807 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3808 (_PyUnicode_LENGTH(unicode) + 1));
3809 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 PyErr_NoMemory();
3811 return NULL;
3812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3814 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3815 w = _PyUnicode_WSTR(unicode);
3816 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3819 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 for (; w < wchar_end; ++one_byte, ++w)
3821 *w = *one_byte;
3822 /* null-terminate the wstr */
3823 *w = 0;
3824 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003825 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 for (; w < wchar_end; ++two_bytes, ++w)
3829 *w = *two_bytes;
3830 /* null-terminate the wstr */
3831 *w = 0;
3832#else
3833 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 PyObject_FREE(_PyUnicode_WSTR(unicode));
3835 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 Py_FatalError("Impossible unicode object state, wstr "
3837 "and str should share memory already.");
3838 return NULL;
3839#endif
3840 }
3841 else {
3842 assert(0 && "This should never happen.");
3843 }
3844 }
3845 }
3846 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003847 *size = PyUnicode_WSTR_LENGTH(unicode);
3848 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003849}
3850
Alexander Belopolsky40018472011-02-26 01:02:56 +00003851Py_UNICODE *
3852PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857
Alexander Belopolsky40018472011-02-26 01:02:56 +00003858Py_ssize_t
3859PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860{
3861 if (!PyUnicode_Check(unicode)) {
3862 PyErr_BadArgument();
3863 goto onError;
3864 }
3865 return PyUnicode_GET_SIZE(unicode);
3866
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 return -1;
3869}
3870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871Py_ssize_t
3872PyUnicode_GetLength(PyObject *unicode)
3873{
Victor Stinner07621332012-06-16 04:53:46 +02003874 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 PyErr_BadArgument();
3876 return -1;
3877 }
Victor Stinner07621332012-06-16 04:53:46 +02003878 if (PyUnicode_READY(unicode) == -1)
3879 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 return PyUnicode_GET_LENGTH(unicode);
3881}
3882
3883Py_UCS4
3884PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3885{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003886 void *data;
3887 int kind;
3888
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003889 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3890 PyErr_BadArgument();
3891 return (Py_UCS4)-1;
3892 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003893 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003894 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 return (Py_UCS4)-1;
3896 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003897 data = PyUnicode_DATA(unicode);
3898 kind = PyUnicode_KIND(unicode);
3899 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900}
3901
3902int
3903PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3904{
3905 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003906 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return -1;
3908 }
Victor Stinner488fa492011-12-12 00:01:39 +01003909 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003910 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003911 PyErr_SetString(PyExc_IndexError, "string index out of range");
3912 return -1;
3913 }
Victor Stinner488fa492011-12-12 00:01:39 +01003914 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003915 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003916 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3917 PyErr_SetString(PyExc_ValueError, "character out of range");
3918 return -1;
3919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3921 index, ch);
3922 return 0;
3923}
3924
/* Return the name of the default encoding, which is always "utf-8".
   The returned pointer refers to a string literal and must not be
   modified or freed by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
3930
Victor Stinner554f3f02010-06-16 23:33:54 +00003931/* create or adjust a UnicodeDecodeError */
3932static void
3933make_decode_exception(PyObject **exceptionObject,
3934 const char *encoding,
3935 const char *input, Py_ssize_t length,
3936 Py_ssize_t startpos, Py_ssize_t endpos,
3937 const char *reason)
3938{
3939 if (*exceptionObject == NULL) {
3940 *exceptionObject = PyUnicodeDecodeError_Create(
3941 encoding, input, length, startpos, endpos, reason);
3942 }
3943 else {
3944 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3945 goto onError;
3946 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3947 goto onError;
3948 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3949 goto onError;
3950 }
3951 return;
3952
3953onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003954 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003955}
3956
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders writing into a wchar_t
   (legacy) output string.

   Looks up the error handler named `errors` (cached in *errorHandler),
   creates or updates the UnicodeDecodeError in *exceptionObject, calls
   the handler and validates its (str, int) result.  If no exception
   occurred, the replacement is copied to *output at *outpos (resizing
   the output when needed) and the state variables are adjusted:
   *input / *inend are reloaded from the exception's object (the callback
   may have replaced it), *endinpos and *inptr are moved to the position
   returned by the handler, and *outpos is advanced past the replacement.

   Returns 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the bytes accessors below must not see a non-bytes
           object, and our reference has to be released. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Length-based copy: unlike wcsncpy() it does not stop (and zero-pad)
       at a U+0000 embedded in the replacement string. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4072
4073static int
4074unicode_decode_call_errorhandler_writer(
4075 const char *errors, PyObject **errorHandler,
4076 const char *encoding, const char *reason,
4077 const char **input, const char **inend, Py_ssize_t *startinpos,
4078 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4079 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4080{
4081 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4082
4083 PyObject *restuple = NULL;
4084 PyObject *repunicode = NULL;
4085 Py_ssize_t insize;
4086 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004087 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004088 PyObject *inputobj = NULL;
4089
4090 if (*errorHandler == NULL) {
4091 *errorHandler = PyCodec_LookupError(errors);
4092 if (*errorHandler == NULL)
4093 goto onError;
4094 }
4095
4096 make_decode_exception(exceptionObject,
4097 encoding,
4098 *input, *inend - *input,
4099 *startinpos, *endinpos,
4100 reason);
4101 if (*exceptionObject == NULL)
4102 goto onError;
4103
4104 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4105 if (restuple == NULL)
4106 goto onError;
4107 if (!PyTuple_Check(restuple)) {
4108 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4109 goto onError;
4110 }
4111 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004112 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004113
4114 /* Copy back the bytes variables, which might have been modified by the
4115 callback */
4116 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4117 if (!inputobj)
4118 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004119 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004121 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004122 *input = PyBytes_AS_STRING(inputobj);
4123 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004124 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004125 /* we can DECREF safely, as the exception has another reference,
4126 so the object won't go away. */
4127 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004131 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004132 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135
Victor Stinner8f674cc2013-04-17 23:02:17 +02004136 if (PyUnicode_READY(repunicode) < 0)
4137 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004138 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004139 if (replen > 1) {
4140 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004141 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004142 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4143 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4144 goto onError;
4145 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004146 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004147 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004153 Py_XDECREF(restuple);
4154 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004158 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159}
4160
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004161/* --- UTF-7 Codec -------------------------------------------------------- */
4162
Antoine Pitrou244651a2009-05-04 18:56:13 +00004163/* See RFC2152 for details. We encode conservatively and decode liberally. */
4164
4165/* Three simple macros defining base-64. */
4166
4167/* Is c a base-64 character? */
4168
4169#define IS_BASE64(c) \
4170 (((c) >= 'A' && (c) <= 'Z') || \
4171 ((c) >= 'a' && (c) <= 'z') || \
4172 ((c) >= '0' && (c) <= '9') || \
4173 (c) == '+' || (c) == '/')
4174
4175/* given that c is a base-64 character, what is its base-64 value? */
4176
4177#define FROM_BASE64(c) \
4178 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4179 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4180 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4181 (c) == '+' ? 62 : 63)
4182
4183/* What is the base-64 character of the bottom 6 bits of n? */
4184
4185#define TO_BASE64(n) \
4186 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4187
4188/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4189 * decoded as itself. We are permissive on decoding; the only ASCII
4190 * byte not decoding to itself is the + which begins a base64
4191 * string. */
4192
4193#define DECODE_DIRECT(c) \
4194 ((c) <= 127 && (c) != '+')
4195
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4229
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- the code point to classify; only 1..127 can be direct
 *             (NUL and non-ASCII are never direct)
 * directO  -- non-zero if Set O characters are written literally
 * directWS -- non-zero if whitespace is written literally
 *
 * The flag arguments are parenthesized in the expansion so that a
 * compound expression (e.g. "a || b") binds as one operand of &&. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241
Alexander Belopolsky40018472011-02-26 01:02:56 +00004242PyObject *
4243PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004244 Py_ssize_t size,
4245 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004247 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4248}
4249
Antoine Pitrou244651a2009-05-04 18:56:13 +00004250/* The decoder. The only state we preserve is our read position,
4251 * i.e. how many characters we have consumed. So if we end in the
4252 * middle of a shift sequence we have to back off the read position
4253 * and the output to the beginning of the sequence, otherwise we lose
4254 * all the shift state (seen bits, number of bits seen, high
4255 * surrogate). */
4256
Alexander Belopolsky40018472011-02-26 01:02:56 +00004257PyObject *
4258PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004259 Py_ssize_t size,
4260 const char *errors,
4261 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004262{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004264 Py_ssize_t startinpos;
4265 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004266 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004267 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 const char *errmsg = "";
4269 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004270 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004271 unsigned int base64bits = 0;
4272 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004273 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 PyObject *errorHandler = NULL;
4275 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004276
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004277 if (size == 0) {
4278 if (consumed)
4279 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004280 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004281 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004283 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004284 _PyUnicodeWriter_Init(&writer);
4285 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286
4287 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288 e = s + size;
4289
4290 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004291 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004293 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 if (inShift) { /* in a base-64 section */
4296 if (IS_BASE64(ch)) { /* consume a base-64 character */
4297 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4298 base64bits += 6;
4299 s++;
4300 if (base64bits >= 16) {
4301 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004302 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 base64bits -= 16;
4304 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004305 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306 if (surrogate) {
4307 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004308 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4309 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004310 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004311 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004312 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004313 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 }
4315 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004316 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004317 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
4320 }
Victor Stinner551ac952011-11-29 22:58:13 +01004321 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 /* first surrogate */
4323 surrogate = outCh;
4324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004326 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 }
4329 }
4330 }
4331 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 inShift = 0;
4333 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004335 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004336 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004337 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (base64bits > 0) { /* left-over bits */
4340 if (base64bits >= 6) {
4341 /* We've seen at least one base-64 character */
4342 errmsg = "partial character in shift sequence";
4343 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 else {
4346 /* Some bits remain; they should be zero */
4347 if (base64buffer != 0) {
4348 errmsg = "non-zero padding bits in shift sequence";
4349 goto utf7Error;
4350 }
4351 }
4352 }
4353 if (ch != '-') {
4354 /* '-' is absorbed; other terminating
4355 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004358 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004359 }
4360 }
4361 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 s++; /* consume '+' */
4364 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004373 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
4375 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004378 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004379 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 else {
4382 startinpos = s-starts;
4383 s++;
4384 errmsg = "unexpected special character";
4385 goto utf7Error;
4386 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004390 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 errors, &errorHandler,
4392 "utf7", errmsg,
4393 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
4397
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 /* end of string */
4399
4400 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4401 /* if we're in an inconsistent state, that's an error */
4402 if (surrogate ||
4403 (base64bits >= 6) ||
4404 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 errors, &errorHandler,
4408 "utf7", "unterminated shift sequence",
4409 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 goto onError;
4412 if (s < e)
4413 goto restart;
4414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416
4417 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004418 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004420 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004421 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004422 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004423 writer.kind, writer.data, shiftOutStart);
4424 Py_XDECREF(errorHandler);
4425 Py_XDECREF(exc);
4426 _PyUnicodeWriter_Dealloc(&writer);
4427 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004428 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004429 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 }
4431 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004432 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004434 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 Py_XDECREF(errorHandler);
4437 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 Py_XDECREF(errorHandler);
4442 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 return NULL;
4445}
4446
4447
Alexander Belopolsky40018472011-02-26 01:02:56 +00004448PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004449_PyUnicode_EncodeUTF7(PyObject *str,
4450 int base64SetO,
4451 int base64WhiteSpace,
4452 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004454 int kind;
4455 void *data;
4456 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004457 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004459 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 unsigned int base64bits = 0;
4461 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 char * out;
4463 char * start;
4464
Benjamin Petersonbac79492012-01-14 13:34:47 -05004465 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004466 return NULL;
4467 kind = PyUnicode_KIND(str);
4468 data = PyUnicode_DATA(str);
4469 len = PyUnicode_GET_LENGTH(str);
4470
4471 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004475 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004476 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004477 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478 if (v == NULL)
4479 return NULL;
4480
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004481 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004482 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004483 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484
Antoine Pitrou244651a2009-05-04 18:56:13 +00004485 if (inShift) {
4486 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4487 /* shifting out */
4488 if (base64bits) { /* output remaining bits */
4489 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4490 base64buffer = 0;
4491 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
4493 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 /* Characters not in the BASE64 set implicitly unshift the sequence
4495 so no '-' is required, except if the character is itself a '-' */
4496 if (IS_BASE64(ch) || ch == '-') {
4497 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 *out++ = (char) ch;
4500 }
4501 else {
4502 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004503 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 else { /* not in a shift sequence */
4506 if (ch == '+') {
4507 *out++ = '+';
4508 *out++ = '-';
4509 }
4510 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4511 *out++ = (char) ch;
4512 }
4513 else {
4514 *out++ = '+';
4515 inShift = 1;
4516 goto encode_char;
4517 }
4518 }
4519 continue;
4520encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004522 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004523
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 /* code first surrogate */
4525 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004526 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 while (base64bits >= 6) {
4528 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4529 base64bits -= 6;
4530 }
4531 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004532 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 base64bits += 16;
4535 base64buffer = (base64buffer << 16) | ch;
4536 while (base64bits >= 6) {
4537 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4538 base64bits -= 6;
4539 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004540 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 if (base64bits)
4542 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4543 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004545 if (_PyBytes_Resize(&v, out - start) < 0)
4546 return NULL;
4547 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004549PyObject *
4550PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4551 Py_ssize_t size,
4552 int base64SetO,
4553 int base64WhiteSpace,
4554 const char *errors)
4555{
4556 PyObject *result;
4557 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4558 if (tmp == NULL)
4559 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004560 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004561 base64WhiteSpace, errors);
4562 Py_DECREF(tmp);
4563 return result;
4564}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566#undef IS_BASE64
4567#undef FROM_BASE64
4568#undef TO_BASE64
4569#undef DECODE_DIRECT
4570#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572/* --- UTF-8 Codec -------------------------------------------------------- */
4573
Alexander Belopolsky40018472011-02-26 01:02:56 +00004574PyObject *
4575PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004576 Py_ssize_t size,
4577 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578{
Walter Dörwald69652032004-09-07 20:24:22 +00004579 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4580}
4581
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004582#include "stringlib/asciilib.h"
4583#include "stringlib/codecs.h"
4584#include "stringlib/undef.h"
4585
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004586#include "stringlib/ucs1lib.h"
4587#include "stringlib/codecs.h"
4588#include "stringlib/undef.h"
4589
4590#include "stringlib/ucs2lib.h"
4591#include "stringlib/codecs.h"
4592#include "stringlib/undef.h"
4593
4594#include "stringlib/ucs4lib.h"
4595#include "stringlib/codecs.h"
4596#include "stringlib/undef.h"
4597
Antoine Pitrouab868312009-01-10 15:40:25 +00004598/* Mask to quickly check whether a C 'long' contains a
4599 non-ASCII, UTF8-encoded char. */
4600#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004601# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004602#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004603# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004604#else
4605# error C 'long' size should be either 4 or 8!
4606#endif
4607
/* Copy the leading run of ASCII bytes (< 0x80) from [start, end) into
   dest and return its length.  Scanning stops at the first byte with
   the high bit set, or at end.

   Where possible the input is examined one C 'long' at a time using
   ASCII_CHAR_MASK; the remainder is handled byte by byte. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* last long-aligned address not beyond end: upper bound for the
       word-at-a-time loops */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both long-aligned here, so whole
           words can be checked and stored in one go. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;          /* some byte in this word is non-ASCII */
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte-wise: the unaligned tail, or the word that
           contained the first non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only locate the end of the ASCII run here; the
       bytes are copied with a single memcpy() afterwards. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* everything was ASCII; do not read the byte at end */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004671
Victor Stinner785938e2011-12-11 20:09:03 +01004672PyObject *
4673PyUnicode_DecodeUTF8Stateful(const char *s,
4674 Py_ssize_t size,
4675 const char *errors,
4676 Py_ssize_t *consumed)
4677{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004679 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681
4682 Py_ssize_t startinpos;
4683 Py_ssize_t endinpos;
4684 const char *errmsg = "";
4685 PyObject *errorHandler = NULL;
4686 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004687
4688 if (size == 0) {
4689 if (consumed)
4690 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004691 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004692 }
4693
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4695 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004696 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697 *consumed = 1;
4698 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004699 }
4700
Victor Stinner8f674cc2013-04-17 23:02:17 +02004701 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004702 writer.min_length = size;
4703 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004704 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004705
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 writer.pos = ascii_decode(s, end, writer.data);
4707 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 while (s < end) {
4709 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 if (PyUnicode_IS_ASCII(writer.buffer))
4713 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 } else {
4719 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 }
4722
4723 switch (ch) {
4724 case 0:
4725 if (s == end || consumed)
4726 goto End;
4727 errmsg = "unexpected end of data";
4728 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004729 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 break;
4731 case 1:
4732 errmsg = "invalid start byte";
4733 startinpos = s - starts;
4734 endinpos = startinpos + 1;
4735 break;
4736 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004737 case 3:
4738 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 errmsg = "invalid continuation byte";
4740 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004741 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 break;
4743 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004744 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 goto onError;
4746 continue;
4747 }
4748
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 errors, &errorHandler,
4751 "utf-8", errmsg,
4752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004753 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004755 }
4756
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 if (consumed)
4759 *consumed = s - starts;
4760
4761 Py_XDECREF(errorHandler);
4762 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004763 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764
4765onError:
4766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004768 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004770}
4771
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004772#ifdef __APPLE__
4773
4774/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004775 used to decode the command line arguments on Mac OS X.
4776
4777 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004778 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004779
4780wchar_t*
4781_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4782{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004783 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 wchar_t *unicode;
4785 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004786
4787 /* Note: size will always be longer than the resulting Unicode
4788 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004789 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004790 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004791 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004792 if (!unicode)
4793 return NULL;
4794
4795 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004796 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004798 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004799 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004800#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004802#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004804#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004805 if (ch > 0xFF) {
4806#if SIZEOF_WCHAR_T == 4
4807 assert(0);
4808#else
4809 assert(Py_UNICODE_IS_SURROGATE(ch));
4810 /* compute and append the two surrogates: */
4811 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4812 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4813#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004814 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815 else {
4816 if (!ch && s == e)
4817 break;
4818 /* surrogateescape */
4819 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4820 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004821 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004822 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004823 return unicode;
4824}
4825
4826#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828/* Primary internal function which creates utf8 encoded bytes objects.
4829
4830 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004831 and allocate exactly as much space needed at the end. Else allocate the
4832 maximum possible needed (4 result bytes per Unicode character), and return
4833 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004834*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004835PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004836_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837{
Victor Stinner6099a032011-12-18 14:22:26 +01004838 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839 void *data;
4840 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842 if (!PyUnicode_Check(unicode)) {
4843 PyErr_BadArgument();
4844 return NULL;
4845 }
4846
4847 if (PyUnicode_READY(unicode) == -1)
4848 return NULL;
4849
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004850 if (PyUnicode_UTF8(unicode))
4851 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4852 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004853
4854 kind = PyUnicode_KIND(unicode);
4855 data = PyUnicode_DATA(unicode);
4856 size = PyUnicode_GET_LENGTH(unicode);
4857
Benjamin Petersonead6b532011-12-20 17:23:42 -06004858 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004859 default:
4860 assert(0);
4861 case PyUnicode_1BYTE_KIND:
4862 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4863 assert(!PyUnicode_IS_ASCII(unicode));
4864 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4865 case PyUnicode_2BYTE_KIND:
4866 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4867 case PyUnicode_4BYTE_KIND:
4868 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870}
4871
Alexander Belopolsky40018472011-02-26 01:02:56 +00004872PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4874 Py_ssize_t size,
4875 const char *errors)
4876{
4877 PyObject *v, *unicode;
4878
4879 unicode = PyUnicode_FromUnicode(s, size);
4880 if (unicode == NULL)
4881 return NULL;
4882 v = _PyUnicode_AsUTF8String(unicode, errors);
4883 Py_DECREF(unicode);
4884 return v;
4885}
4886
4887PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004888PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891}
4892
Walter Dörwald41980ca2007-08-16 21:55:45 +00004893/* --- UTF-32 Codec ------------------------------------------------------- */
4894
4895PyObject *
4896PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 Py_ssize_t size,
4898 const char *errors,
4899 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900{
4901 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4902}
4903
4904PyObject *
4905PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 Py_ssize_t size,
4907 const char *errors,
4908 int *byteorder,
4909 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910{
4911 const char *starts = s;
4912 Py_ssize_t startinpos;
4913 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004914 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004915 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004916 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004917 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004918 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919 PyObject *errorHandler = NULL;
4920 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004921
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922 q = (unsigned char *)s;
4923 e = q + size;
4924
4925 if (byteorder)
4926 bo = *byteorder;
4927
4928 /* Check for BOM marks (U+FEFF) in the input and adjust current
4929 byte order setting accordingly. In native mode, the leading BOM
4930 mark is skipped, in all other modes, it is copied to the output
4931 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004932 if (bo == 0 && size >= 4) {
4933 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4934 if (bom == 0x0000FEFF) {
4935 bo = -1;
4936 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004938 else if (bom == 0xFFFE0000) {
4939 bo = 1;
4940 q += 4;
4941 }
4942 if (byteorder)
4943 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944 }
4945
Victor Stinnere64322e2012-10-30 23:12:47 +01004946 if (q == e) {
4947 if (consumed)
4948 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004949 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 }
4951
Victor Stinnere64322e2012-10-30 23:12:47 +01004952#ifdef WORDS_BIGENDIAN
4953 le = bo < 0;
4954#else
4955 le = bo <= 0;
4956#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004958
Victor Stinner8f674cc2013-04-17 23:02:17 +02004959 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004960 writer.min_length = (e - q + 3) / 4;
4961 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004962 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004963
Victor Stinnere64322e2012-10-30 23:12:47 +01004964 while (1) {
4965 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004967
Victor Stinnere64322e2012-10-30 23:12:47 +01004968 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 enum PyUnicode_Kind kind = writer.kind;
4970 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004971 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004973 if (le) {
4974 do {
4975 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4976 if (ch > maxch)
4977 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004978 if (kind != PyUnicode_1BYTE_KIND &&
4979 Py_UNICODE_IS_SURROGATE(ch))
4980 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004982 q += 4;
4983 } while (q <= last);
4984 }
4985 else {
4986 do {
4987 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4988 if (ch > maxch)
4989 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004990 if (kind != PyUnicode_1BYTE_KIND &&
4991 Py_UNICODE_IS_SURROGATE(ch))
4992 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004994 q += 4;
4995 } while (q <= last);
4996 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004998 }
4999
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005000 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005001 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005002 startinpos = ((const char *)q) - starts;
5003 endinpos = startinpos + 4;
5004 }
5005 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005010 startinpos = ((const char *)q) - starts;
5011 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 else {
5014 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005015 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 goto onError;
5017 q += 4;
5018 continue;
5019 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005020 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005021 startinpos = ((const char *)q) - starts;
5022 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005024
5025 /* The remaining input chars are ignored if the callback
5026 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005027 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005029 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005031 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005033 }
5034
Walter Dörwald41980ca2007-08-16 21:55:45 +00005035 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038 Py_XDECREF(errorHandler);
5039 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005040 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005043 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 Py_XDECREF(errorHandler);
5045 Py_XDECREF(exc);
5046 return NULL;
5047}
5048
5049PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005050_PyUnicode_EncodeUTF32(PyObject *str,
5051 const char *errors,
5052 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005054 enum PyUnicode_Kind kind;
5055 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005056 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005057 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005058 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005059#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005060 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005062 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005064 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005065 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005066 PyObject *errorHandler = NULL;
5067 PyObject *exc = NULL;
5068 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005070 if (!PyUnicode_Check(str)) {
5071 PyErr_BadArgument();
5072 return NULL;
5073 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005074 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005075 return NULL;
5076 kind = PyUnicode_KIND(str);
5077 data = PyUnicode_DATA(str);
5078 len = PyUnicode_GET_LENGTH(str);
5079
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005080 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005081 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005082 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005083 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 if (v == NULL)
5085 return NULL;
5086
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005087 /* output buffer is 4-bytes aligned */
5088 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5089 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005091 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005092 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005093 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005095 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005096 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005097 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005098 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005099 else
5100 encoding = "utf-32";
5101
5102 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005103 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5104 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 }
5106
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005107 pos = 0;
5108 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005109 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005110
5111 if (kind == PyUnicode_2BYTE_KIND) {
5112 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5113 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005114 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005115 else {
5116 assert(kind == PyUnicode_4BYTE_KIND);
5117 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5118 &out, native_ordering);
5119 }
5120 if (pos == len)
5121 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005122
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005123 rep = unicode_encode_call_errorhandler(
5124 errors, &errorHandler,
5125 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005126 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005127 if (!rep)
5128 goto error;
5129
5130 if (PyBytes_Check(rep)) {
5131 repsize = PyBytes_GET_SIZE(rep);
5132 if (repsize & 3) {
5133 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005134 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005135 "surrogates not allowed");
5136 goto error;
5137 }
5138 moreunits = repsize / 4;
5139 }
5140 else {
5141 assert(PyUnicode_Check(rep));
5142 if (PyUnicode_READY(rep) < 0)
5143 goto error;
5144 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5145 if (!PyUnicode_IS_ASCII(rep)) {
5146 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005147 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005148 "surrogates not allowed");
5149 goto error;
5150 }
5151 }
5152
5153 /* four bytes are reserved for each surrogate */
5154 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005155 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005156 Py_ssize_t morebytes = 4 * (moreunits - 1);
5157 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5158 /* integer overflow */
5159 PyErr_NoMemory();
5160 goto error;
5161 }
5162 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5163 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005164 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005165 }
5166
5167 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005168 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5169 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005170 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005171 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005172 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5173 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005174 }
5175
5176 Py_CLEAR(rep);
5177 }
5178
5179 /* Cut back to size actually needed. This is necessary for, for example,
5180 encoding of a string containing isolated surrogates and the 'ignore'
5181 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005182 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 if (nsize != PyBytes_GET_SIZE(v))
5184 _PyBytes_Resize(&v, nsize);
5185 Py_XDECREF(errorHandler);
5186 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005187 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005188 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005189 error:
5190 Py_XDECREF(rep);
5191 Py_XDECREF(errorHandler);
5192 Py_XDECREF(exc);
5193 Py_XDECREF(v);
5194 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195}
5196
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005198PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5199 Py_ssize_t size,
5200 const char *errors,
5201 int byteorder)
5202{
5203 PyObject *result;
5204 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5205 if (tmp == NULL)
5206 return NULL;
5207 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5208 Py_DECREF(tmp);
5209 return result;
5210}
5211
5212PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005213PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214{
Victor Stinnerb960b342011-11-20 19:12:52 +01005215 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005216}
5217
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218/* --- UTF-16 Codec ------------------------------------------------------- */
5219
Tim Peters772747b2001-08-09 22:21:55 +00005220PyObject *
5221PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 const char *errors,
5224 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Walter Dörwald69652032004-09-07 20:24:22 +00005226 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5227}
5228
5229PyObject *
5230PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_ssize_t size,
5232 const char *errors,
5233 int *byteorder,
5234 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t startinpos;
5238 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005239 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005240 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005241 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005242 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005243 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 PyObject *errorHandler = NULL;
5245 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005246 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247
Tim Peters772747b2001-08-09 22:21:55 +00005248 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250
5251 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005252 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005254 /* Check for BOM marks (U+FEFF) in the input and adjust current
5255 byte order setting accordingly. In native mode, the leading BOM
5256 mark is skipped, in all other modes, it is copied to the output
5257 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005258 if (bo == 0 && size >= 2) {
5259 const Py_UCS4 bom = (q[1] << 8) | q[0];
5260 if (bom == 0xFEFF) {
5261 q += 2;
5262 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005264 else if (bom == 0xFFFE) {
5265 q += 2;
5266 bo = 1;
5267 }
5268 if (byteorder)
5269 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271
Antoine Pitrou63065d72012-05-15 23:48:04 +02005272 if (q == e) {
5273 if (consumed)
5274 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005275 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005276 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005277
Christian Heimes743e0cd2012-10-17 23:52:17 +02005278#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005279 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005280 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005281#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005283 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005284#endif
Tim Peters772747b2001-08-09 22:21:55 +00005285
Antoine Pitrou63065d72012-05-15 23:48:04 +02005286 /* Note: size will always be longer than the resulting Unicode
5287 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005288 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005289 writer.min_length = (e - q + 1) / 2;
5290 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005291 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005292
Antoine Pitrou63065d72012-05-15 23:48:04 +02005293 while (1) {
5294 Py_UCS4 ch = 0;
5295 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005298 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005301 native_ordering);
5302 else
5303 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005305 native_ordering);
5306 } else if (kind == PyUnicode_2BYTE_KIND) {
5307 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 native_ordering);
5310 } else {
5311 assert(kind == PyUnicode_4BYTE_KIND);
5312 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005313 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005314 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005315 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005317
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 switch (ch)
5319 {
5320 case 0:
5321 /* remaining byte at the end? (size should be even) */
5322 if (q == e || consumed)
5323 goto End;
5324 errmsg = "truncated data";
5325 startinpos = ((const char *)q) - starts;
5326 endinpos = ((const char *)e) - starts;
5327 break;
5328 /* The remaining input chars are ignored if the callback
5329 chooses to skip the input */
5330 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005331 q -= 2;
5332 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005333 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005334 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005335 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 endinpos = ((const char *)e) - starts;
5337 break;
5338 case 2:
5339 errmsg = "illegal encoding";
5340 startinpos = ((const char *)q) - 2 - starts;
5341 endinpos = startinpos + 2;
5342 break;
5343 case 3:
5344 errmsg = "illegal UTF-16 surrogate";
5345 startinpos = ((const char *)q) - 4 - starts;
5346 endinpos = startinpos + 2;
5347 break;
5348 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005349 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005350 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 continue;
5352 }
5353
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005355 errors,
5356 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005357 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 &starts,
5359 (const char **)&e,
5360 &startinpos,
5361 &endinpos,
5362 &exc,
5363 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 }
5367
Antoine Pitrou63065d72012-05-15 23:48:04 +02005368End:
Walter Dörwald69652032004-09-07 20:24:22 +00005369 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 Py_XDECREF(errorHandler);
5373 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 Py_XDECREF(errorHandler);
5379 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 return NULL;
5381}
5382
Tim Peters772747b2001-08-09 22:21:55 +00005383PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384_PyUnicode_EncodeUTF16(PyObject *str,
5385 const char *errors,
5386 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005388 enum PyUnicode_Kind kind;
5389 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005391 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005392 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005393 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005394#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005395 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005396#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005397 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005398#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 const char *encoding;
5400 Py_ssize_t nsize, pos;
5401 PyObject *errorHandler = NULL;
5402 PyObject *exc = NULL;
5403 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005405 if (!PyUnicode_Check(str)) {
5406 PyErr_BadArgument();
5407 return NULL;
5408 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005409 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005410 return NULL;
5411 kind = PyUnicode_KIND(str);
5412 data = PyUnicode_DATA(str);
5413 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005415 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005416 if (kind == PyUnicode_4BYTE_KIND) {
5417 const Py_UCS4 *in = (const Py_UCS4 *)data;
5418 const Py_UCS4 *end = in + len;
5419 while (in < end)
5420 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005421 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005422 }
5423 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 nsize = len + pairs + (byteorder == 0);
5426 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 if (v == NULL)
5428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005430 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005431 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005435 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005436 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005437
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 if (kind == PyUnicode_1BYTE_KIND) {
5439 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5440 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005441 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005442
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005443 if (byteorder < 0)
5444 encoding = "utf-16-le";
5445 else if (byteorder > 0)
5446 encoding = "utf-16-be";
5447 else
5448 encoding = "utf-16";
5449
5450 pos = 0;
5451 while (pos < len) {
5452 Py_ssize_t repsize, moreunits;
5453
5454 if (kind == PyUnicode_2BYTE_KIND) {
5455 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5456 &out, native_ordering);
5457 }
5458 else {
5459 assert(kind == PyUnicode_4BYTE_KIND);
5460 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5461 &out, native_ordering);
5462 }
5463 if (pos == len)
5464 break;
5465
5466 rep = unicode_encode_call_errorhandler(
5467 errors, &errorHandler,
5468 encoding, "surrogates not allowed",
5469 str, &exc, pos, pos + 1, &pos);
5470 if (!rep)
5471 goto error;
5472
5473 if (PyBytes_Check(rep)) {
5474 repsize = PyBytes_GET_SIZE(rep);
5475 if (repsize & 1) {
5476 raise_encode_exception(&exc, encoding,
5477 str, pos - 1, pos,
5478 "surrogates not allowed");
5479 goto error;
5480 }
5481 moreunits = repsize / 2;
5482 }
5483 else {
5484 assert(PyUnicode_Check(rep));
5485 if (PyUnicode_READY(rep) < 0)
5486 goto error;
5487 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5488 if (!PyUnicode_IS_ASCII(rep)) {
5489 raise_encode_exception(&exc, encoding,
5490 str, pos - 1, pos,
5491 "surrogates not allowed");
5492 goto error;
5493 }
5494 }
5495
5496 /* two bytes are reserved for each surrogate */
5497 if (moreunits > 1) {
5498 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5499 Py_ssize_t morebytes = 2 * (moreunits - 1);
5500 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5501 /* integer overflow */
5502 PyErr_NoMemory();
5503 goto error;
5504 }
5505 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5506 goto error;
5507 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5508 }
5509
5510 if (PyBytes_Check(rep)) {
5511 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5512 out += moreunits;
5513 } else /* rep is unicode */ {
5514 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5515 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5516 &out, native_ordering);
5517 }
5518
5519 Py_CLEAR(rep);
5520 }
5521
5522 /* Cut back to size actually needed. This is necessary for, for example,
5523 encoding of a string containing isolated surrogates and the 'ignore' handler
5524 is used. */
5525 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5526 if (nsize != PyBytes_GET_SIZE(v))
5527 _PyBytes_Resize(&v, nsize);
5528 Py_XDECREF(errorHandler);
5529 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005530 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005531 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005532 error:
5533 Py_XDECREF(rep);
5534 Py_XDECREF(errorHandler);
5535 Py_XDECREF(exc);
5536 Py_XDECREF(v);
5537 return NULL;
5538#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539}
5540
Alexander Belopolsky40018472011-02-26 01:02:56 +00005541PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005542PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5543 Py_ssize_t size,
5544 const char *errors,
5545 int byteorder)
5546{
5547 PyObject *result;
5548 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5549 if (tmp == NULL)
5550 return NULL;
5551 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5552 Py_DECREF(tmp);
5553 return result;
5554}
5555
5556PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005557PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005559 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560}
5561
5562/* --- Unicode Escape Codec ----------------------------------------------- */
5563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5565 if all the escapes in the string make it still a valid ASCII string.
5566 Returns -1 if any escapes were found which cause the string to
5567 pop out of ASCII range. Otherwise returns the length of the
5568 required buffer to hold the string.
5569 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005570static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5572{
5573 const unsigned char *p = (const unsigned char *)s;
5574 const unsigned char *end = p + size;
5575 Py_ssize_t length = 0;
5576
5577 if (size < 0)
5578 return -1;
5579
5580 for (; p < end; ++p) {
5581 if (*p > 127) {
5582 /* Non-ASCII */
5583 return -1;
5584 }
5585 else if (*p != '\\') {
5586 /* Normal character */
5587 ++length;
5588 }
5589 else {
5590 /* Backslash-escape, check next char */
5591 ++p;
5592 /* Escape sequence reaches till end of string or
5593 non-ASCII follow-up. */
5594 if (p >= end || *p > 127)
5595 return -1;
5596 switch (*p) {
5597 case '\n':
5598 /* backslash + \n result in zero characters */
5599 break;
5600 case '\\': case '\'': case '\"':
5601 case 'b': case 'f': case 't':
5602 case 'n': case 'r': case 'v': case 'a':
5603 ++length;
5604 break;
5605 case '0': case '1': case '2': case '3':
5606 case '4': case '5': case '6': case '7':
5607 case 'x': case 'u': case 'U': case 'N':
5608 /* these do not guarantee ASCII characters */
5609 return -1;
5610 default:
5611 /* count the backslash + the other character */
5612 length += 2;
5613 }
5614 }
5615 }
5616 return length;
5617}
5618
Fredrik Lundh06d12682001-01-24 07:59:11 +00005619static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005620
Alexander Belopolsky40018472011-02-26 01:02:56 +00005621PyObject *
5622PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005623 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005624 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005626 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005627 Py_ssize_t startinpos;
5628 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005631 char* message;
5632 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 PyObject *errorHandler = NULL;
5634 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005635 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005636
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005638 if (len == 0)
5639 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640
5641 /* After length_of_escaped_ascii_string() there are two alternatives,
5642 either the string is pure ASCII with named escapes like \n, etc.
5643 and we determined it's exact size (common case)
5644 or it contains \x, \u, ... escape sequences. then we create a
5645 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005646 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005647 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005648 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 }
5650 else {
5651 /* Escaped strings will always be longer than the resulting
5652 Unicode string, so we start with size here and then reduce the
5653 length after conversion to the true value.
5654 (but if the error callback returns a long replacement string
5655 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005656 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657 }
5658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005660 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005662
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 while (s < end) {
5664 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005665 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
5668 /* Non-escape characters are interpreted as Unicode ordinals */
5669 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005670 x = (unsigned char)*s;
5671 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005672 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 continue;
5675 }
5676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 /* \ - Escapes */
5679 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005680 c = *s++;
5681 if (s > end)
5682 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005683
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005684 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687#define WRITECHAR(ch) \
5688 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005689 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005690 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005692
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005694 case '\\': WRITECHAR('\\'); break;
5695 case '\'': WRITECHAR('\''); break;
5696 case '\"': WRITECHAR('\"'); break;
5697 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005698 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 case 'f': WRITECHAR('\014'); break;
5700 case 't': WRITECHAR('\t'); break;
5701 case 'n': WRITECHAR('\n'); break;
5702 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005704 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005705 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005706 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 case '0': case '1': case '2': case '3':
5710 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005711 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005712 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005713 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005714 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005715 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 break;
5719
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 /* hex escapes */
5721 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005723 digits = 2;
5724 message = "truncated \\xXX escape";
5725 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005729 digits = 4;
5730 message = "truncated \\uXXXX escape";
5731 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005734 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005735 digits = 8;
5736 message = "truncated \\UXXXXXXXX escape";
5737 hexescape:
5738 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005739 if (end - s < digits) {
5740 /* count only hex digits */
5741 for (; s < end; ++s) {
5742 c = (unsigned char)*s;
5743 if (!Py_ISXDIGIT(c))
5744 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005745 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005746 goto error;
5747 }
5748 for (; digits--; ++s) {
5749 c = (unsigned char)*s;
5750 if (!Py_ISXDIGIT(c))
5751 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 chr = (chr<<4) & ~0xF;
5753 if (c >= '0' && c <= '9')
5754 chr += c - '0';
5755 else if (c >= 'a' && c <= 'f')
5756 chr += 10 + c - 'a';
5757 else
5758 chr += 10 + c - 'A';
5759 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005760 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 /* _decoding_error will have already written into the
5762 target buffer. */
5763 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005766 message = "illegal Unicode character";
5767 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005768 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005769 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005770 break;
5771
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005773 case 'N':
5774 message = "malformed \\N character escape";
5775 if (ucnhash_CAPI == NULL) {
5776 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5778 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005779 if (ucnhash_CAPI == NULL)
5780 goto ucnhashError;
5781 }
5782 if (*s == '{') {
5783 const char *start = s+1;
5784 /* look for the closing brace */
5785 while (*s != '}' && s < end)
5786 s++;
5787 if (s > start && s < end && *s == '}') {
5788 /* found a name. look it up in the unicode database */
5789 message = "unknown Unicode character name";
5790 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005791 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005792 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005793 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005794 goto store;
5795 }
5796 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005797 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005798
5799 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005800 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 message = "\\ at end of string";
5802 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005803 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005804 }
5805 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005806 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005807 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005808 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005811 continue;
5812
5813 error:
5814 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005815 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005816 errors, &errorHandler,
5817 "unicodeescape", message,
5818 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005819 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005820 goto onError;
5821 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005823#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005825 Py_XDECREF(errorHandler);
5826 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005827 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005828
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005830 PyErr_SetString(
5831 PyExc_UnicodeError,
5832 "\\N escapes not supported (can't load unicodedata module)"
5833 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005834 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 Py_XDECREF(errorHandler);
5836 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005837 return NULL;
5838
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005840 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 Py_XDECREF(errorHandler);
5842 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 return NULL;
5844}
5845
5846/* Return a Unicode-Escape string version of the Unicode object.
5847
5848 If quotes is true, the string is enclosed in u"" or u'' quotes as
5849 appropriate.
5850
5851*/
5852
Alexander Belopolsky40018472011-02-26 01:02:56 +00005853PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005854PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005856 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005857 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005859 int kind;
5860 void *data;
5861 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
Ezio Melottie7f90372012-10-05 03:33:31 +03005863 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005864 escape.
5865
Ezio Melottie7f90372012-10-05 03:33:31 +03005866 For UCS1 strings it's '\xxx', 4 bytes per source character.
5867 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5868 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005869 */
5870
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 if (!PyUnicode_Check(unicode)) {
5872 PyErr_BadArgument();
5873 return NULL;
5874 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005875 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005876 return NULL;
5877 len = PyUnicode_GET_LENGTH(unicode);
5878 kind = PyUnicode_KIND(unicode);
5879 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005880 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5882 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5883 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5884 }
5885
5886 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005887 return PyBytes_FromStringAndSize(NULL, 0);
5888
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005891
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005892 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 if (repr == NULL)
5897 return NULL;
5898
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005899 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005901 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005902 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005903
Walter Dörwald79e913e2007-05-12 11:08:06 +00005904 /* Escape backslashes */
5905 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 *p++ = '\\';
5907 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005908 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005909 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005910
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005911 /* Map 21-bit characters to '\U00xxxxxx' */
5912 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005913 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005914 *p++ = '\\';
5915 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005916 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5917 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5918 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5919 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5920 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5921 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5922 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5923 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005925 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005928 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 *p++ = '\\';
5930 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005931 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5932 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5933 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5934 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005936
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005937 /* Map special whitespace to '\t', \n', '\r' */
5938 else if (ch == '\t') {
5939 *p++ = '\\';
5940 *p++ = 't';
5941 }
5942 else if (ch == '\n') {
5943 *p++ = '\\';
5944 *p++ = 'n';
5945 }
5946 else if (ch == '\r') {
5947 *p++ = '\\';
5948 *p++ = 'r';
5949 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005950
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005951 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005952 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005954 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005955 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5956 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005957 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005958
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 /* Copy everything else as-is */
5960 else
5961 *p++ = (char) ch;
5962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005964 assert(p - PyBytes_AS_STRING(repr) > 0);
5965 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5966 return NULL;
5967 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968}
5969
Alexander Belopolsky40018472011-02-26 01:02:56 +00005970PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005971PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5972 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005974 PyObject *result;
5975 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5976 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005978 result = PyUnicode_AsUnicodeEscapeString(tmp);
5979 Py_DECREF(tmp);
5980 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981}
5982
5983/* --- Raw Unicode Escape Codec ------------------------------------------- */
5984
Alexander Belopolsky40018472011-02-26 01:02:56 +00005985PyObject *
5986PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005987 Py_ssize_t size,
5988 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005991 Py_ssize_t startinpos;
5992 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005993 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 const char *end;
5995 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 PyObject *errorHandler = NULL;
5997 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005998
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005999 if (size == 0)
6000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006001
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 /* Escaped strings will always be longer than the resulting
6003 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 length after conversion to the true value. (But decoding error
6005 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006006 _PyUnicodeWriter_Init(&writer);
6007 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006008
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 end = s + size;
6010 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 unsigned char c;
6012 Py_UCS4 x;
6013 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006014 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* Non-escape characters are interpreted as Unicode ordinals */
6017 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006018 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006019 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006020 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006022 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 startinpos = s-starts;
6024
6025 /* \u-escapes are only interpreted iff the number of leading
6026 backslashes if odd */
6027 bs = s;
6028 for (;s < end;) {
6029 if (*s != '\\')
6030 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006031 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006032 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006033 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 }
6035 if (((s - bs) & 1) == 0 ||
6036 s >= end ||
6037 (*s != 'u' && *s != 'U')) {
6038 continue;
6039 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006040 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 count = *s=='u' ? 4 : 8;
6042 s++;
6043
6044 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 for (x = 0, i = 0; i < count; ++i, ++s) {
6046 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006047 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006049 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 errors, &errorHandler,
6051 "rawunicodeescape", "truncated \\uXXXX",
6052 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 goto onError;
6055 goto nextByte;
6056 }
6057 x = (x<<4) & ~0xF;
6058 if (c >= '0' && c <= '9')
6059 x += c - '0';
6060 else if (c >= 'a' && c <= 'f')
6061 x += 10 + c - 'a';
6062 else
6063 x += 10 + c - 'A';
6064 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006065 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006066 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006067 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068 }
6069 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006070 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006072 errors, &errorHandler,
6073 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006075 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006077 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 nextByte:
6079 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 Py_XDECREF(errorHandler);
6082 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006083 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006084
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006086 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 Py_XDECREF(errorHandler);
6088 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 return NULL;
6090}
6091
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006092
Alexander Belopolsky40018472011-02-26 01:02:56 +00006093PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006094PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006096 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 char *p;
6098 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006099 Py_ssize_t expandsize, pos;
6100 int kind;
6101 void *data;
6102 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006104 if (!PyUnicode_Check(unicode)) {
6105 PyErr_BadArgument();
6106 return NULL;
6107 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006108 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006109 return NULL;
6110 kind = PyUnicode_KIND(unicode);
6111 data = PyUnicode_DATA(unicode);
6112 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006113 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6114 bytes, and 1 byte characters 4. */
6115 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006116
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006117 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006120 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 if (repr == NULL)
6122 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006123 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006124 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006126 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 for (pos = 0; pos < len; pos++) {
6128 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 /* Map 32-bit characters to '\Uxxxxxxxx' */
6130 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006131 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 *p++ = '\\';
6133 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006134 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6135 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6136 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6137 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6138 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6139 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6140 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6141 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 *p++ = '\\';
6146 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006147 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6148 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6149 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6150 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 /* Copy everything else as-is */
6153 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 *p++ = (char) ch;
6155 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006156
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 assert(p > q);
6158 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006159 return NULL;
6160 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161}
6162
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6165 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 PyObject *result;
6168 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6169 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006170 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6172 Py_DECREF(tmp);
6173 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174}
6175
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006176/* --- Unicode Internal Codec ------------------------------------------- */
6177
Alexander Belopolsky40018472011-02-26 01:02:56 +00006178PyObject *
6179_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006180 Py_ssize_t size,
6181 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006182{
6183 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184 Py_ssize_t startinpos;
6185 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006186 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006187 const char *end;
6188 const char *reason;
6189 PyObject *errorHandler = NULL;
6190 PyObject *exc = NULL;
6191
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006192 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006193 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006194 1))
6195 return NULL;
6196
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006197 if (size == 0)
6198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006199
Victor Stinner8f674cc2013-04-17 23:02:17 +02006200 _PyUnicodeWriter_Init(&writer);
6201 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6202 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006204 }
6205 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006206
Victor Stinner8f674cc2013-04-17 23:02:17 +02006207 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006209 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006210 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006211 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006212 endinpos = end-starts;
6213 reason = "truncated input";
6214 goto error;
6215 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006216 /* We copy the raw representation one byte at a time because the
6217 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006218 ((char *) &uch)[0] = s[0];
6219 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006220#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006221 ((char *) &uch)[2] = s[2];
6222 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006223#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006224 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006225#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006226 /* We have to sanity check the raw data, otherwise doom looms for
6227 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006228 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006229 endinpos = s - starts + Py_UNICODE_SIZE;
6230 reason = "illegal code point (> 0x10FFFF)";
6231 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006232 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006233#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006234 s += Py_UNICODE_SIZE;
6235#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006236 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006237 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006238 Py_UNICODE uch2;
6239 ((char *) &uch2)[0] = s[0];
6240 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006241 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006242 {
Victor Stinner551ac952011-11-29 22:58:13 +01006243 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006244 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245 }
6246 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006247#endif
6248
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006249 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006250 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006251 continue;
6252
6253 error:
6254 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006255 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006256 errors, &errorHandler,
6257 "unicode_internal", reason,
6258 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006259 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006260 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006261 }
6262
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 Py_XDECREF(errorHandler);
6264 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006265 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006268 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006269 Py_XDECREF(errorHandler);
6270 Py_XDECREF(exc);
6271 return NULL;
6272}
6273
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274/* --- Latin-1 Codec ------------------------------------------------------ */
6275
Alexander Belopolsky40018472011-02-26 01:02:56 +00006276PyObject *
6277PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006278 Py_ssize_t size,
6279 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006282 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283}
6284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006286static void
6287make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006288 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006289 PyObject *unicode,
6290 Py_ssize_t startpos, Py_ssize_t endpos,
6291 const char *reason)
6292{
6293 if (*exceptionObject == NULL) {
6294 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006296 encoding, unicode, startpos, endpos, reason);
6297 }
6298 else {
6299 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6300 goto onError;
6301 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6302 goto onError;
6303 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6304 goto onError;
6305 return;
6306 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006307 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006308 }
6309}
6310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006312static void
6313raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006314 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006315 PyObject *unicode,
6316 Py_ssize_t startpos, Py_ssize_t endpos,
6317 const char *reason)
6318{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006319 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006320 encoding, unicode, startpos, endpos, reason);
6321 if (*exceptionObject != NULL)
6322 PyCodec_StrictErrors(*exceptionObject);
6323}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324
6325/* error handling callback helper:
6326 build arguments, call the callback and check the arguments,
6327 put the result into newpos and return the replacement string, which
6328 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006329static PyObject *
6330unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006331 PyObject **errorHandler,
6332 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006333 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006334 Py_ssize_t startpos, Py_ssize_t endpos,
6335 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006337 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339 PyObject *restuple;
6340 PyObject *resunicode;
6341
6342 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 }
6347
Benjamin Petersonbac79492012-01-14 13:34:47 -05006348 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006349 return NULL;
6350 len = PyUnicode_GET_LENGTH(unicode);
6351
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006352 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356
6357 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006362 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 Py_DECREF(restuple);
6364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006366 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 &resunicode, newpos)) {
6368 Py_DECREF(restuple);
6369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006371 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6372 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6373 Py_DECREF(restuple);
6374 return NULL;
6375 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 *newpos = len + *newpos;
6378 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006379 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 Py_DECREF(restuple);
6381 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 Py_INCREF(resunicode);
6384 Py_DECREF(restuple);
6385 return resunicode;
6386}
6387
Alexander Belopolsky40018472011-02-26 01:02:56 +00006388static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006390 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006391 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 /* input state */
6394 Py_ssize_t pos=0, size;
6395 int kind;
6396 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 /* output object */
6398 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 /* pointer into the output */
6400 char *str;
6401 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006403 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6404 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 PyObject *errorHandler = NULL;
6406 PyObject *exc = NULL;
6407 /* the following variable is used for caching string comparisons
6408 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6409 int known_errorHandler = -1;
6410
Benjamin Petersonbac79492012-01-14 13:34:47 -05006411 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 return NULL;
6413 size = PyUnicode_GET_LENGTH(unicode);
6414 kind = PyUnicode_KIND(unicode);
6415 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 /* allocate enough for a simple encoding without
6417 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006418 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006419 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006420 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006422 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006423 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 ressize = size;
6425
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006426 while (pos < size) {
6427 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 /* can we encode this? */
6430 if (c<limit) {
6431 /* no overflow check, because we know that the space is enough */
6432 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006433 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006434 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 Py_ssize_t requiredsize;
6437 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006438 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006440 Py_ssize_t collstart = pos;
6441 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006443 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 ++collend;
6445 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6446 if (known_errorHandler==-1) {
6447 if ((errors==NULL) || (!strcmp(errors, "strict")))
6448 known_errorHandler = 1;
6449 else if (!strcmp(errors, "replace"))
6450 known_errorHandler = 2;
6451 else if (!strcmp(errors, "ignore"))
6452 known_errorHandler = 3;
6453 else if (!strcmp(errors, "xmlcharrefreplace"))
6454 known_errorHandler = 4;
6455 else
6456 known_errorHandler = 0;
6457 }
6458 switch (known_errorHandler) {
6459 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006460 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 goto onError;
6462 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006463 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 *str++ = '?'; /* fall through */
6465 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006466 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 break;
6468 case 4: /* xmlcharrefreplace */
6469 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006470 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006471 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006472 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006474 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006476 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006478 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006480 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006483 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006486 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006487 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006488 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006489 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006490 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006491 if (requiredsize > PY_SSIZE_T_MAX - incr)
6492 goto overflow;
6493 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006495 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6496 goto overflow;
6497 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006499 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 requiredsize = 2*ressize;
6501 if (_PyBytes_Resize(&res, requiredsize))
6502 goto onError;
6503 str = PyBytes_AS_STRING(res) + respos;
6504 ressize = requiredsize;
6505 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 /* generate replacement */
6507 for (i = collstart; i < collend; ++i) {
6508 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 break;
6512 default:
6513 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 encoding, reason, unicode, &exc,
6515 collstart, collend, &newpos);
6516 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006517 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006519 if (PyBytes_Check(repunicode)) {
6520 /* Directly copy bytes result to output. */
6521 repsize = PyBytes_Size(repunicode);
6522 if (repsize > 1) {
6523 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006524 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006525 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6526 Py_DECREF(repunicode);
6527 goto overflow;
6528 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006529 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6530 Py_DECREF(repunicode);
6531 goto onError;
6532 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006533 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006534 ressize += repsize-1;
6535 }
6536 memcpy(str, PyBytes_AsString(repunicode), repsize);
6537 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006538 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006539 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006540 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 /* need more space? (at least enough for what we
6543 have+the replacement+the rest of the string, so
6544 we won't have to check space for encodable characters) */
6545 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006547 requiredsize = respos;
6548 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6549 goto overflow;
6550 requiredsize += repsize;
6551 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6552 goto overflow;
6553 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006555 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 requiredsize = 2*ressize;
6557 if (_PyBytes_Resize(&res, requiredsize)) {
6558 Py_DECREF(repunicode);
6559 goto onError;
6560 }
6561 str = PyBytes_AS_STRING(res) + respos;
6562 ressize = requiredsize;
6563 }
6564 /* check if there is anything unencodable in the replacement
6565 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 for (i = 0; repsize-->0; ++i, ++str) {
6567 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006569 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006570 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 Py_DECREF(repunicode);
6572 goto onError;
6573 }
6574 *str = (char)c;
6575 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006577 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006578 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006579 }
6580 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006581 /* Resize if we allocated to much */
6582 size = str - PyBytes_AS_STRING(res);
6583 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006584 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006585 if (_PyBytes_Resize(&res, size) < 0)
6586 goto onError;
6587 }
6588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006589 Py_XDECREF(errorHandler);
6590 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006591 return res;
6592
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006593 overflow:
6594 PyErr_SetString(PyExc_OverflowError,
6595 "encoded result is too long for a Python string");
6596
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006597 onError:
6598 Py_XDECREF(res);
6599 Py_XDECREF(errorHandler);
6600 Py_XDECREF(exc);
6601 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006602}
6603
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006604/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
6606PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006607 Py_ssize_t size,
6608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610 PyObject *result;
6611 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6612 if (unicode == NULL)
6613 return NULL;
6614 result = unicode_encode_ucs1(unicode, errors, 256);
6615 Py_DECREF(unicode);
6616 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617}
6618
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006620_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
6622 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 PyErr_BadArgument();
6624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006626 if (PyUnicode_READY(unicode) == -1)
6627 return NULL;
6628 /* Fast path: if it is a one-byte string, construct
6629 bytes object directly. */
6630 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6631 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6632 PyUnicode_GET_LENGTH(unicode));
6633 /* Non-Latin-1 characters present. Defer to above function to
6634 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006636}
6637
6638PyObject*
6639PyUnicode_AsLatin1String(PyObject *unicode)
6640{
6641 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642}
6643
6644/* --- 7-bit ASCII Codec -------------------------------------------------- */
6645
Alexander Belopolsky40018472011-02-26 01:02:56 +00006646PyObject *
6647PyUnicode_DecodeASCII(const char *s,
6648 Py_ssize_t size,
6649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006652 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006653 int kind;
6654 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006655 Py_ssize_t startinpos;
6656 Py_ssize_t endinpos;
6657 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 const char *e;
6659 PyObject *errorHandler = NULL;
6660 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006663 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006666 if (size == 1 && (unsigned char)s[0] < 128)
6667 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668
Victor Stinner8f674cc2013-04-17 23:02:17 +02006669 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006670 writer.min_length = size;
6671 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006672 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006675 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006676 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006677 writer.pos = outpos;
6678 if (writer.pos == size)
6679 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006680
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006681 s += writer.pos;
6682 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006684 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006686 PyUnicode_WRITE(kind, data, writer.pos, c);
6687 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 ++s;
6689 }
6690 else {
6691 startinpos = s-starts;
6692 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006693 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 errors, &errorHandler,
6695 "ascii", "ordinal not in range(128)",
6696 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006697 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006699 kind = writer.kind;
6700 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 Py_XDECREF(errorHandler);
6704 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006705 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006706
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006708 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_XDECREF(errorHandler);
6710 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 return NULL;
6712}
6713
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006715PyObject *
6716PyUnicode_EncodeASCII(const Py_UNICODE *p,
6717 Py_ssize_t size,
6718 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 PyObject *result;
6721 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6722 if (unicode == NULL)
6723 return NULL;
6724 result = unicode_encode_ucs1(unicode, errors, 128);
6725 Py_DECREF(unicode);
6726 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727}
6728
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006730_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731{
6732 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 PyErr_BadArgument();
6734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006736 if (PyUnicode_READY(unicode) == -1)
6737 return NULL;
6738 /* Fast path: if it is an ASCII-only string, construct bytes object
6739 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006740 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006741 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6742 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006743 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744}
6745
6746PyObject *
6747PyUnicode_AsASCIIString(PyObject *unicode)
6748{
6749 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750}
6751
Victor Stinner99b95382011-07-04 14:23:54 +02006752#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006753
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006754/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006755
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006756#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006757#define NEED_RETRY
6758#endif
6759
Victor Stinner3a50e702011-10-18 21:21:00 +02006760#ifndef WC_ERR_INVALID_CHARS
6761# define WC_ERR_INVALID_CHARS 0x0080
6762#endif
6763
6764static char*
6765code_page_name(UINT code_page, PyObject **obj)
6766{
6767 *obj = NULL;
6768 if (code_page == CP_ACP)
6769 return "mbcs";
6770 if (code_page == CP_UTF7)
6771 return "CP_UTF7";
6772 if (code_page == CP_UTF8)
6773 return "CP_UTF8";
6774
6775 *obj = PyBytes_FromFormat("cp%u", code_page);
6776 if (*obj == NULL)
6777 return NULL;
6778 return PyBytes_AS_STRING(*obj);
6779}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780
Victor Stinner3a50e702011-10-18 21:21:00 +02006781static DWORD
6782decode_code_page_flags(UINT code_page)
6783{
6784 if (code_page == CP_UTF7) {
6785 /* The CP_UTF7 decoder only supports flags=0 */
6786 return 0;
6787 }
6788 else
6789 return MB_ERR_INVALID_CHARS;
6790}
6791
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006792/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006793 * Decode a byte string from a Windows code page into unicode object in strict
6794 * mode.
6795 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006796 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6797 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006800decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006801 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006802 const char *in,
6803 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804{
Victor Stinner3a50e702011-10-18 21:21:00 +02006805 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006806 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006807 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808
6809 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006810 assert(insize > 0);
6811 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6812 if (outsize <= 0)
6813 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814
6815 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006817 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006818 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 if (*v == NULL)
6820 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006821 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822 }
6823 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006825 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006826 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829 }
6830
6831 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6833 if (outsize <= 0)
6834 goto error;
6835 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006836
Victor Stinner3a50e702011-10-18 21:21:00 +02006837error:
6838 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6839 return -2;
6840 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006841 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842}
6843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844/*
6845 * Decode a byte string from a code page into unicode object with an error
6846 * handler.
6847 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006848 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 * UnicodeDecodeError exception and returns -1 on error.
6850 */
6851static int
6852decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006853 PyObject **v,
6854 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006855 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006856{
6857 const char *startin = in;
6858 const char *endin = in + size;
6859 const DWORD flags = decode_code_page_flags(code_page);
6860 /* Ideally, we should get reason from FormatMessage. This is the Windows
6861 2000 English version of the message. */
6862 const char *reason = "No mapping for the Unicode character exists "
6863 "in the target code page.";
6864 /* each step cannot decode more than 1 character, but a character can be
6865 represented as a surrogate pair */
6866 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006867 int insize;
6868 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 PyObject *errorHandler = NULL;
6870 PyObject *exc = NULL;
6871 PyObject *encoding_obj = NULL;
6872 char *encoding;
6873 DWORD err;
6874 int ret = -1;
6875
6876 assert(size > 0);
6877
6878 encoding = code_page_name(code_page, &encoding_obj);
6879 if (encoding == NULL)
6880 return -1;
6881
Victor Stinner7d00cc12014-03-17 23:08:06 +01006882 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6884 UnicodeDecodeError. */
6885 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6886 if (exc != NULL) {
6887 PyCodec_StrictErrors(exc);
6888 Py_CLEAR(exc);
6889 }
6890 goto error;
6891 }
6892
6893 if (*v == NULL) {
6894 /* Create unicode object */
6895 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6896 PyErr_NoMemory();
6897 goto error;
6898 }
Victor Stinnerab595942011-12-17 04:59:06 +01006899 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 if (*v == NULL)
6902 goto error;
6903 startout = PyUnicode_AS_UNICODE(*v);
6904 }
6905 else {
6906 /* Extend unicode object */
6907 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6908 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6909 PyErr_NoMemory();
6910 goto error;
6911 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006912 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 goto error;
6914 startout = PyUnicode_AS_UNICODE(*v) + n;
6915 }
6916
6917 /* Decode the byte string character per character */
6918 out = startout;
6919 while (in < endin)
6920 {
6921 /* Decode a character */
6922 insize = 1;
6923 do
6924 {
6925 outsize = MultiByteToWideChar(code_page, flags,
6926 in, insize,
6927 buffer, Py_ARRAY_LENGTH(buffer));
6928 if (outsize > 0)
6929 break;
6930 err = GetLastError();
6931 if (err != ERROR_NO_UNICODE_TRANSLATION
6932 && err != ERROR_INSUFFICIENT_BUFFER)
6933 {
6934 PyErr_SetFromWindowsErr(0);
6935 goto error;
6936 }
6937 insize++;
6938 }
6939 /* 4=maximum length of a UTF-8 sequence */
6940 while (insize <= 4 && (in + insize) <= endin);
6941
6942 if (outsize <= 0) {
6943 Py_ssize_t startinpos, endinpos, outpos;
6944
Victor Stinner7d00cc12014-03-17 23:08:06 +01006945 /* last character in partial decode? */
6946 if (in + insize >= endin && !final)
6947 break;
6948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949 startinpos = in - startin;
6950 endinpos = startinpos + 1;
6951 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006952 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006953 errors, &errorHandler,
6954 encoding, reason,
6955 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006956 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 {
6958 goto error;
6959 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006960 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 }
6962 else {
6963 in += insize;
6964 memcpy(out, buffer, outsize * sizeof(wchar_t));
6965 out += outsize;
6966 }
6967 }
6968
6969 /* write a NUL character at the end */
6970 *out = 0;
6971
6972 /* Extend unicode object */
6973 outsize = out - startout;
6974 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006975 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006976 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006977 /* (in - startin) <= size and size is an int */
6978 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006979
6980error:
6981 Py_XDECREF(encoding_obj);
6982 Py_XDECREF(errorHandler);
6983 Py_XDECREF(exc);
6984 return ret;
6985}
6986
Victor Stinner3a50e702011-10-18 21:21:00 +02006987static PyObject *
6988decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006989 const char *s, Py_ssize_t size,
6990 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006991{
Victor Stinner76a31a62011-11-04 00:05:13 +01006992 PyObject *v = NULL;
6993 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994
Victor Stinner3a50e702011-10-18 21:21:00 +02006995 if (code_page < 0) {
6996 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6997 return NULL;
6998 }
6999
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 do
7004 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007006 if (size > INT_MAX) {
7007 chunk_size = INT_MAX;
7008 final = 0;
7009 done = 0;
7010 }
7011 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007012#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007013 {
7014 chunk_size = (int)size;
7015 final = (consumed == NULL);
7016 done = 1;
7017 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007018
Victor Stinner76a31a62011-11-04 00:05:13 +01007019 if (chunk_size == 0 && done) {
7020 if (v != NULL)
7021 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007022 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007023 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
Victor Stinner76a31a62011-11-04 00:05:13 +01007025 converted = decode_code_page_strict(code_page, &v,
7026 s, chunk_size);
7027 if (converted == -2)
7028 converted = decode_code_page_errors(code_page, &v,
7029 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007030 errors, final);
7031 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007032
7033 if (converted < 0) {
7034 Py_XDECREF(v);
7035 return NULL;
7036 }
7037
7038 if (consumed)
7039 *consumed += converted;
7040
7041 s += converted;
7042 size -= converted;
7043 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007044
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007045 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046}
7047
Alexander Belopolsky40018472011-02-26 01:02:56 +00007048PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007049PyUnicode_DecodeCodePageStateful(int code_page,
7050 const char *s,
7051 Py_ssize_t size,
7052 const char *errors,
7053 Py_ssize_t *consumed)
7054{
7055 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7056}
7057
7058PyObject *
7059PyUnicode_DecodeMBCSStateful(const char *s,
7060 Py_ssize_t size,
7061 const char *errors,
7062 Py_ssize_t *consumed)
7063{
7064 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7065}
7066
7067PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007068PyUnicode_DecodeMBCS(const char *s,
7069 Py_ssize_t size,
7070 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007071{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7073}
7074
Victor Stinner3a50e702011-10-18 21:21:00 +02007075static DWORD
7076encode_code_page_flags(UINT code_page, const char *errors)
7077{
7078 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007079 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 }
7081 else if (code_page == CP_UTF7) {
7082 /* CP_UTF7 only supports flags=0 */
7083 return 0;
7084 }
7085 else {
7086 if (errors != NULL && strcmp(errors, "replace") == 0)
7087 return 0;
7088 else
7089 return WC_NO_BEST_FIT_CHARS;
7090 }
7091}
7092
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007093/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 * Encode a Unicode string to a Windows code page into a byte string in strict
7095 * mode.
7096 *
7097 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007098 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007100static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007101encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007102 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104{
Victor Stinner554f3f02010-06-16 23:33:54 +00007105 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 BOOL *pusedDefaultChar = &usedDefaultChar;
7107 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007108 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007109 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007110 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 const DWORD flags = encode_code_page_flags(code_page, NULL);
7112 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007113 /* Create a substring so that we can get the UTF-16 representation
7114 of just the slice under consideration. */
7115 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116
Martin v. Löwis3d325192011-11-04 18:23:06 +01007117 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007118
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007120 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007122 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007123
Victor Stinner2fc507f2011-11-04 20:06:39 +01007124 substring = PyUnicode_Substring(unicode, offset, offset+len);
7125 if (substring == NULL)
7126 return -1;
7127 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7128 if (p == NULL) {
7129 Py_DECREF(substring);
7130 return -1;
7131 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007132 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007133
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007134 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007136 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 NULL, 0,
7138 NULL, pusedDefaultChar);
7139 if (outsize <= 0)
7140 goto error;
7141 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007142 if (pusedDefaultChar && *pusedDefaultChar) {
7143 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007145 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007146
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007150 if (*outbytes == NULL) {
7151 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007153 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 }
7156 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 const Py_ssize_t n = PyBytes_Size(*outbytes);
7159 if (outsize > PY_SSIZE_T_MAX - n) {
7160 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007161 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007164 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7165 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007167 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007169 }
7170
7171 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007173 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 out, outsize,
7175 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007176 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 if (outsize <= 0)
7178 goto error;
7179 if (pusedDefaultChar && *pusedDefaultChar)
7180 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007181 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007182
Victor Stinner3a50e702011-10-18 21:21:00 +02007183error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007184 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7186 return -2;
7187 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007188 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007189}
7190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191/*
7192 * Encode a Unicode string to a Windows code page into a byte string using a
7193 * error handler.
7194 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007195 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 * -1 on other error.
7197 */
7198static int
7199encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007200 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007201 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007202{
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007204 Py_ssize_t pos = unicode_offset;
7205 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 /* Ideally, we should get reason from FormatMessage. This is the Windows
7207 2000 English version of the message. */
7208 const char *reason = "invalid character";
7209 /* 4=maximum length of a UTF-8 sequence */
7210 char buffer[4];
7211 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7212 Py_ssize_t outsize;
7213 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 PyObject *errorHandler = NULL;
7215 PyObject *exc = NULL;
7216 PyObject *encoding_obj = NULL;
7217 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 PyObject *rep;
7220 int ret = -1;
7221
7222 assert(insize > 0);
7223
7224 encoding = code_page_name(code_page, &encoding_obj);
7225 if (encoding == NULL)
7226 return -1;
7227
7228 if (errors == NULL || strcmp(errors, "strict") == 0) {
7229 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7230 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007231 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (exc != NULL) {
7233 PyCodec_StrictErrors(exc);
7234 Py_DECREF(exc);
7235 }
7236 Py_XDECREF(encoding_obj);
7237 return -1;
7238 }
7239
7240 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7241 pusedDefaultChar = &usedDefaultChar;
7242 else
7243 pusedDefaultChar = NULL;
7244
7245 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7246 PyErr_NoMemory();
7247 goto error;
7248 }
7249 outsize = insize * Py_ARRAY_LENGTH(buffer);
7250
7251 if (*outbytes == NULL) {
7252 /* Create string object */
7253 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7254 if (*outbytes == NULL)
7255 goto error;
7256 out = PyBytes_AS_STRING(*outbytes);
7257 }
7258 else {
7259 /* Extend string object */
7260 Py_ssize_t n = PyBytes_Size(*outbytes);
7261 if (n > PY_SSIZE_T_MAX - outsize) {
7262 PyErr_NoMemory();
7263 goto error;
7264 }
7265 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7266 goto error;
7267 out = PyBytes_AS_STRING(*outbytes) + n;
7268 }
7269
7270 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007271 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7274 wchar_t chars[2];
7275 int charsize;
7276 if (ch < 0x10000) {
7277 chars[0] = (wchar_t)ch;
7278 charsize = 1;
7279 }
7280 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007281 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7282 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007283 charsize = 2;
7284 }
7285
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007287 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 buffer, Py_ARRAY_LENGTH(buffer),
7289 NULL, pusedDefaultChar);
7290 if (outsize > 0) {
7291 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7292 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007293 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 memcpy(out, buffer, outsize);
7295 out += outsize;
7296 continue;
7297 }
7298 }
7299 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7300 PyErr_SetFromWindowsErr(0);
7301 goto error;
7302 }
7303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 rep = unicode_encode_call_errorhandler(
7305 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007306 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007307 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 if (rep == NULL)
7309 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007311
7312 if (PyBytes_Check(rep)) {
7313 outsize = PyBytes_GET_SIZE(rep);
7314 if (outsize != 1) {
7315 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7316 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7317 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7318 Py_DECREF(rep);
7319 goto error;
7320 }
7321 out = PyBytes_AS_STRING(*outbytes) + offset;
7322 }
7323 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7324 out += outsize;
7325 }
7326 else {
7327 Py_ssize_t i;
7328 enum PyUnicode_Kind kind;
7329 void *data;
7330
Benjamin Petersonbac79492012-01-14 13:34:47 -05007331 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 Py_DECREF(rep);
7333 goto error;
7334 }
7335
7336 outsize = PyUnicode_GET_LENGTH(rep);
7337 if (outsize != 1) {
7338 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7339 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7340 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7341 Py_DECREF(rep);
7342 goto error;
7343 }
7344 out = PyBytes_AS_STRING(*outbytes) + offset;
7345 }
7346 kind = PyUnicode_KIND(rep);
7347 data = PyUnicode_DATA(rep);
7348 for (i=0; i < outsize; i++) {
7349 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7350 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007351 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007352 encoding, unicode,
7353 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 "unable to encode error handler result to ASCII");
7355 Py_DECREF(rep);
7356 goto error;
7357 }
7358 *out = (unsigned char)ch;
7359 out++;
7360 }
7361 }
7362 Py_DECREF(rep);
7363 }
7364 /* write a NUL byte */
7365 *out = 0;
7366 outsize = out - PyBytes_AS_STRING(*outbytes);
7367 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7368 if (_PyBytes_Resize(outbytes, outsize) < 0)
7369 goto error;
7370 ret = 0;
7371
7372error:
7373 Py_XDECREF(encoding_obj);
7374 Py_XDECREF(errorHandler);
7375 Py_XDECREF(exc);
7376 return ret;
7377}
7378
Victor Stinner3a50e702011-10-18 21:21:00 +02007379static PyObject *
7380encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007381 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 const char *errors)
7383{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007384 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007386 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007387 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007388
Victor Stinner29dacf22015-01-26 16:41:32 +01007389 if (!PyUnicode_Check(unicode)) {
7390 PyErr_BadArgument();
7391 return NULL;
7392 }
7393
Benjamin Petersonbac79492012-01-14 13:34:47 -05007394 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007395 return NULL;
7396 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007397
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 if (code_page < 0) {
7399 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7400 return NULL;
7401 }
7402
Martin v. Löwis3d325192011-11-04 18:23:06 +01007403 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007404 return PyBytes_FromStringAndSize(NULL, 0);
7405
Victor Stinner7581cef2011-11-03 22:32:33 +01007406 offset = 0;
7407 do
7408 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007409#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007410 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007411 chunks. */
7412 if (len > INT_MAX/2) {
7413 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007414 done = 0;
7415 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007416 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007418 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007419 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007420 done = 1;
7421 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422
Victor Stinner76a31a62011-11-04 00:05:13 +01007423 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007424 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007425 errors);
7426 if (ret == -2)
7427 ret = encode_code_page_errors(code_page, &outbytes,
7428 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 if (ret < 0) {
7431 Py_XDECREF(outbytes);
7432 return NULL;
7433 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007434
Victor Stinner7581cef2011-11-03 22:32:33 +01007435 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007436 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007437 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 return outbytes;
7440}
7441
7442PyObject *
7443PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7444 Py_ssize_t size,
7445 const char *errors)
7446{
Victor Stinner7581cef2011-11-03 22:32:33 +01007447 PyObject *unicode, *res;
7448 unicode = PyUnicode_FromUnicode(p, size);
7449 if (unicode == NULL)
7450 return NULL;
7451 res = encode_code_page(CP_ACP, unicode, errors);
7452 Py_DECREF(unicode);
7453 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007454}
7455
7456PyObject *
7457PyUnicode_EncodeCodePage(int code_page,
7458 PyObject *unicode,
7459 const char *errors)
7460{
Victor Stinner7581cef2011-11-03 22:32:33 +01007461 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007462}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007463
Alexander Belopolsky40018472011-02-26 01:02:56 +00007464PyObject *
7465PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007466{
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007468}
7469
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470#undef NEED_RETRY
7471
Victor Stinner99b95382011-07-04 14:23:54 +02007472#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007473
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474/* --- Character Mapping Codec -------------------------------------------- */
7475
Victor Stinnerfb161b12013-04-18 01:44:27 +02007476static int
7477charmap_decode_string(const char *s,
7478 Py_ssize_t size,
7479 PyObject *mapping,
7480 const char *errors,
7481 _PyUnicodeWriter *writer)
7482{
7483 const char *starts = s;
7484 const char *e;
7485 Py_ssize_t startinpos, endinpos;
7486 PyObject *errorHandler = NULL, *exc = NULL;
7487 Py_ssize_t maplen;
7488 enum PyUnicode_Kind mapkind;
7489 void *mapdata;
7490 Py_UCS4 x;
7491 unsigned char ch;
7492
7493 if (PyUnicode_READY(mapping) == -1)
7494 return -1;
7495
7496 maplen = PyUnicode_GET_LENGTH(mapping);
7497 mapdata = PyUnicode_DATA(mapping);
7498 mapkind = PyUnicode_KIND(mapping);
7499
7500 e = s + size;
7501
7502 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7503 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7504 * is disabled in encoding aliases, latin1 is preferred because
7505 * its implementation is faster. */
7506 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7507 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7508 Py_UCS4 maxchar = writer->maxchar;
7509
7510 assert (writer->kind == PyUnicode_1BYTE_KIND);
7511 while (s < e) {
7512 ch = *s;
7513 x = mapdata_ucs1[ch];
7514 if (x > maxchar) {
7515 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7516 goto onError;
7517 maxchar = writer->maxchar;
7518 outdata = (Py_UCS1 *)writer->data;
7519 }
7520 outdata[writer->pos] = x;
7521 writer->pos++;
7522 ++s;
7523 }
7524 return 0;
7525 }
7526
7527 while (s < e) {
7528 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7529 enum PyUnicode_Kind outkind = writer->kind;
7530 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7531 if (outkind == PyUnicode_1BYTE_KIND) {
7532 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7533 Py_UCS4 maxchar = writer->maxchar;
7534 while (s < e) {
7535 ch = *s;
7536 x = mapdata_ucs2[ch];
7537 if (x > maxchar)
7538 goto Error;
7539 outdata[writer->pos] = x;
7540 writer->pos++;
7541 ++s;
7542 }
7543 break;
7544 }
7545 else if (outkind == PyUnicode_2BYTE_KIND) {
7546 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7547 while (s < e) {
7548 ch = *s;
7549 x = mapdata_ucs2[ch];
7550 if (x == 0xFFFE)
7551 goto Error;
7552 outdata[writer->pos] = x;
7553 writer->pos++;
7554 ++s;
7555 }
7556 break;
7557 }
7558 }
7559 ch = *s;
7560
7561 if (ch < maplen)
7562 x = PyUnicode_READ(mapkind, mapdata, ch);
7563 else
7564 x = 0xfffe; /* invalid value */
7565Error:
7566 if (x == 0xfffe)
7567 {
7568 /* undefined mapping */
7569 startinpos = s-starts;
7570 endinpos = startinpos+1;
7571 if (unicode_decode_call_errorhandler_writer(
7572 errors, &errorHandler,
7573 "charmap", "character maps to <undefined>",
7574 &starts, &e, &startinpos, &endinpos, &exc, &s,
7575 writer)) {
7576 goto onError;
7577 }
7578 continue;
7579 }
7580
7581 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7582 goto onError;
7583 ++s;
7584 }
7585 Py_XDECREF(errorHandler);
7586 Py_XDECREF(exc);
7587 return 0;
7588
7589onError:
7590 Py_XDECREF(errorHandler);
7591 Py_XDECREF(exc);
7592 return -1;
7593}
7594
7595static int
7596charmap_decode_mapping(const char *s,
7597 Py_ssize_t size,
7598 PyObject *mapping,
7599 const char *errors,
7600 _PyUnicodeWriter *writer)
7601{
7602 const char *starts = s;
7603 const char *e;
7604 Py_ssize_t startinpos, endinpos;
7605 PyObject *errorHandler = NULL, *exc = NULL;
7606 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007607 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007608
7609 e = s + size;
7610
7611 while (s < e) {
7612 ch = *s;
7613
7614 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7615 key = PyLong_FromLong((long)ch);
7616 if (key == NULL)
7617 goto onError;
7618
7619 item = PyObject_GetItem(mapping, key);
7620 Py_DECREF(key);
7621 if (item == NULL) {
7622 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7623 /* No mapping found means: mapping is undefined. */
7624 PyErr_Clear();
7625 goto Undefined;
7626 } else
7627 goto onError;
7628 }
7629
7630 /* Apply mapping */
7631 if (item == Py_None)
7632 goto Undefined;
7633 if (PyLong_Check(item)) {
7634 long value = PyLong_AS_LONG(item);
7635 if (value == 0xFFFE)
7636 goto Undefined;
7637 if (value < 0 || value > MAX_UNICODE) {
7638 PyErr_Format(PyExc_TypeError,
7639 "character mapping must be in range(0x%lx)",
7640 (unsigned long)MAX_UNICODE + 1);
7641 goto onError;
7642 }
7643
7644 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7645 goto onError;
7646 }
7647 else if (PyUnicode_Check(item)) {
7648 if (PyUnicode_READY(item) == -1)
7649 goto onError;
7650 if (PyUnicode_GET_LENGTH(item) == 1) {
7651 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7652 if (value == 0xFFFE)
7653 goto Undefined;
7654 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7655 goto onError;
7656 }
7657 else {
7658 writer->overallocate = 1;
7659 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7660 goto onError;
7661 }
7662 }
7663 else {
7664 /* wrong return value */
7665 PyErr_SetString(PyExc_TypeError,
7666 "character mapping must return integer, None or str");
7667 goto onError;
7668 }
7669 Py_CLEAR(item);
7670 ++s;
7671 continue;
7672
7673Undefined:
7674 /* undefined mapping */
7675 Py_CLEAR(item);
7676 startinpos = s-starts;
7677 endinpos = startinpos+1;
7678 if (unicode_decode_call_errorhandler_writer(
7679 errors, &errorHandler,
7680 "charmap", "character maps to <undefined>",
7681 &starts, &e, &startinpos, &endinpos, &exc, &s,
7682 writer)) {
7683 goto onError;
7684 }
7685 }
7686 Py_XDECREF(errorHandler);
7687 Py_XDECREF(exc);
7688 return 0;
7689
7690onError:
7691 Py_XDECREF(item);
7692 Py_XDECREF(errorHandler);
7693 Py_XDECREF(exc);
7694 return -1;
7695}
7696
Alexander Belopolsky40018472011-02-26 01:02:56 +00007697PyObject *
7698PyUnicode_DecodeCharmap(const char *s,
7699 Py_ssize_t size,
7700 PyObject *mapping,
7701 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007703 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007704
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 /* Default to Latin-1 */
7706 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007710 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007711 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007712 writer.min_length = size;
7713 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007715
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007716 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007717 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7718 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007719 }
7720 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007721 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007724 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007727 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 return NULL;
7729}
7730
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007731/* Charmap encoding: the lookup table */
7732
Alexander Belopolsky40018472011-02-26 01:02:56 +00007733struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 PyObject_HEAD
7735 unsigned char level1[32];
7736 int count2, count3;
7737 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007738};
7739
7740static PyObject*
7741encoding_map_size(PyObject *obj, PyObject* args)
7742{
7743 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746}
7747
7748static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007749 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 PyDoc_STR("Return the size (in bytes) of this object") },
7751 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752};
7753
7754static void
7755encoding_map_dealloc(PyObject* o)
7756{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007757 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758}
7759
7760static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007761 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 "EncodingMap", /*tp_name*/
7763 sizeof(struct encoding_map), /*tp_basicsize*/
7764 0, /*tp_itemsize*/
7765 /* methods */
7766 encoding_map_dealloc, /*tp_dealloc*/
7767 0, /*tp_print*/
7768 0, /*tp_getattr*/
7769 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007770 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 0, /*tp_repr*/
7772 0, /*tp_as_number*/
7773 0, /*tp_as_sequence*/
7774 0, /*tp_as_mapping*/
7775 0, /*tp_hash*/
7776 0, /*tp_call*/
7777 0, /*tp_str*/
7778 0, /*tp_getattro*/
7779 0, /*tp_setattro*/
7780 0, /*tp_as_buffer*/
7781 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7782 0, /*tp_doc*/
7783 0, /*tp_traverse*/
7784 0, /*tp_clear*/
7785 0, /*tp_richcompare*/
7786 0, /*tp_weaklistoffset*/
7787 0, /*tp_iter*/
7788 0, /*tp_iternext*/
7789 encoding_map_methods, /*tp_methods*/
7790 0, /*tp_members*/
7791 0, /*tp_getset*/
7792 0, /*tp_base*/
7793 0, /*tp_dict*/
7794 0, /*tp_descr_get*/
7795 0, /*tp_descr_set*/
7796 0, /*tp_dictoffset*/
7797 0, /*tp_init*/
7798 0, /*tp_alloc*/
7799 0, /*tp_new*/
7800 0, /*tp_free*/
7801 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007802};
7803
7804PyObject*
7805PyUnicode_BuildEncodingMap(PyObject* string)
7806{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807 PyObject *result;
7808 struct encoding_map *mresult;
7809 int i;
7810 int need_dict = 0;
7811 unsigned char level1[32];
7812 unsigned char level2[512];
7813 unsigned char *mlevel1, *mlevel2, *mlevel3;
7814 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007815 int kind;
7816 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007817 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007819
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007820 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007821 PyErr_BadArgument();
7822 return NULL;
7823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 kind = PyUnicode_KIND(string);
7825 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007826 length = PyUnicode_GET_LENGTH(string);
7827 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007828 memset(level1, 0xFF, sizeof level1);
7829 memset(level2, 0xFF, sizeof level2);
7830
7831 /* If there isn't a one-to-one mapping of NULL to \0,
7832 or if there are non-BMP characters, we need to use
7833 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007836 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 ch = PyUnicode_READ(kind, data, i);
7839 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007840 need_dict = 1;
7841 break;
7842 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007844 /* unmapped character */
7845 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 l1 = ch >> 11;
7847 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 if (level1[l1] == 0xFF)
7849 level1[l1] = count2++;
7850 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007851 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 }
7853
7854 if (count2 >= 0xFF || count3 >= 0xFF)
7855 need_dict = 1;
7856
7857 if (need_dict) {
7858 PyObject *result = PyDict_New();
7859 PyObject *key, *value;
7860 if (!result)
7861 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007862 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007864 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 if (!key || !value)
7866 goto failed1;
7867 if (PyDict_SetItem(result, key, value) == -1)
7868 goto failed1;
7869 Py_DECREF(key);
7870 Py_DECREF(value);
7871 }
7872 return result;
7873 failed1:
7874 Py_XDECREF(key);
7875 Py_XDECREF(value);
7876 Py_DECREF(result);
7877 return NULL;
7878 }
7879
7880 /* Create a three-level trie */
7881 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7882 16*count2 + 128*count3 - 1);
7883 if (!result)
7884 return PyErr_NoMemory();
7885 PyObject_Init(result, &EncodingMapType);
7886 mresult = (struct encoding_map*)result;
7887 mresult->count2 = count2;
7888 mresult->count3 = count3;
7889 mlevel1 = mresult->level1;
7890 mlevel2 = mresult->level23;
7891 mlevel3 = mresult->level23 + 16*count2;
7892 memcpy(mlevel1, level1, 32);
7893 memset(mlevel2, 0xFF, 16*count2);
7894 memset(mlevel3, 0, 128*count3);
7895 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007896 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007898 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7899 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007900 /* unmapped character */
7901 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007902 o1 = ch>>11;
7903 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 i2 = 16*mlevel1[o1] + o2;
7905 if (mlevel2[i2] == 0xFF)
7906 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007907 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 i3 = 128*mlevel2[i2] + o3;
7909 mlevel3[i3] = i;
7910 }
7911 return result;
7912}
7913
7914static int
Victor Stinner22168992011-11-20 17:09:18 +01007915encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916{
7917 struct encoding_map *map = (struct encoding_map*)mapping;
7918 int l1 = c>>11;
7919 int l2 = (c>>7) & 0xF;
7920 int l3 = c & 0x7F;
7921 int i;
7922
Victor Stinner22168992011-11-20 17:09:18 +01007923 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007925 if (c == 0)
7926 return 0;
7927 /* level 1*/
7928 i = map->level1[l1];
7929 if (i == 0xFF) {
7930 return -1;
7931 }
7932 /* level 2*/
7933 i = map->level23[16*i+l2];
7934 if (i == 0xFF) {
7935 return -1;
7936 }
7937 /* level 3 */
7938 i = map->level23[16*map->count2 + 128*i + l3];
7939 if (i == 0) {
7940 return -1;
7941 }
7942 return i;
7943}
7944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945/* Lookup the character ch in the mapping. If the character
7946 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007947 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007948static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007949charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950{
Christian Heimes217cfd12007-12-02 14:31:20 +00007951 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 PyObject *x;
7953
7954 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 x = PyObject_GetItem(mapping, w);
7957 Py_DECREF(w);
7958 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7960 /* No mapping found means: mapping is undefined. */
7961 PyErr_Clear();
7962 x = Py_None;
7963 Py_INCREF(x);
7964 return x;
7965 } else
7966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007968 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007970 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 long value = PyLong_AS_LONG(x);
7972 if (value < 0 || value > 255) {
7973 PyErr_SetString(PyExc_TypeError,
7974 "character mapping must be in range(256)");
7975 Py_DECREF(x);
7976 return NULL;
7977 }
7978 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007980 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 /* wrong return value */
7984 PyErr_Format(PyExc_TypeError,
7985 "character mapping must return integer, bytes or None, not %.400s",
7986 x->ob_type->tp_name);
7987 Py_DECREF(x);
7988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 }
7990}
7991
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007993charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007994{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007995 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7996 /* exponentially overallocate to minimize reallocations */
7997 if (requiredsize < 2*outsize)
7998 requiredsize = 2*outsize;
7999 if (_PyBytes_Resize(outobj, requiredsize))
8000 return -1;
8001 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002}
8003
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008008 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008009 space is available. Return a new reference to the object that
8010 was put in the output buffer, or Py_None, if the mapping was undefined
8011 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008012 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008013static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008014charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017 PyObject *rep;
8018 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008019 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020
Christian Heimes90aa7642007-12-19 02:45:37 +00008021 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008022 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024 if (res == -1)
8025 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 if (outsize<requiredsize)
8027 if (charmapencode_resize(outobj, outpos, requiredsize))
8028 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008029 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 outstart[(*outpos)++] = (char)res;
8031 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 }
8033
8034 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008035 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 Py_DECREF(rep);
8039 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 if (PyLong_Check(rep)) {
8042 Py_ssize_t requiredsize = *outpos+1;
8043 if (outsize<requiredsize)
8044 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8045 Py_DECREF(rep);
8046 return enc_EXCEPTION;
8047 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008048 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 else {
8052 const char *repchars = PyBytes_AS_STRING(rep);
8053 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8054 Py_ssize_t requiredsize = *outpos+repsize;
8055 if (outsize<requiredsize)
8056 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8057 Py_DECREF(rep);
8058 return enc_EXCEPTION;
8059 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008060 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 memcpy(outstart + *outpos, repchars, repsize);
8062 *outpos += repsize;
8063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008064 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 Py_DECREF(rep);
8066 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067}
8068
8069/* handle an error in PyUnicode_EncodeCharmap
8070 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008071static int
8072charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008073 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008075 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008076 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077{
8078 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008079 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008080 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008081 enum PyUnicode_Kind kind;
8082 void *data;
8083 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008085 Py_ssize_t collstartpos = *inpos;
8086 Py_ssize_t collendpos = *inpos+1;
8087 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 char *encoding = "charmap";
8089 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008092 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093
Benjamin Petersonbac79492012-01-14 13:34:47 -05008094 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008095 return -1;
8096 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* find all unencodable characters */
8098 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008100 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008102 val = encoding_map_lookup(ch, mapping);
8103 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 break;
8105 ++collendpos;
8106 continue;
8107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008108
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008109 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8110 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 if (rep==NULL)
8112 return -1;
8113 else if (rep!=Py_None) {
8114 Py_DECREF(rep);
8115 break;
8116 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 }
8120 /* cache callback name lookup
8121 * (if not done yet, i.e. it's the first error) */
8122 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 if ((errors==NULL) || (!strcmp(errors, "strict")))
8124 *known_errorHandler = 1;
8125 else if (!strcmp(errors, "replace"))
8126 *known_errorHandler = 2;
8127 else if (!strcmp(errors, "ignore"))
8128 *known_errorHandler = 3;
8129 else if (!strcmp(errors, "xmlcharrefreplace"))
8130 *known_errorHandler = 4;
8131 else
8132 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 }
8134 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008135 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008136 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 return -1;
8138 case 2: /* replace */
8139 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 x = charmapencode_output('?', mapping, res, respos);
8141 if (x==enc_EXCEPTION) {
8142 return -1;
8143 }
8144 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008145 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return -1;
8147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 }
8149 /* fall through */
8150 case 3: /* ignore */
8151 *inpos = collendpos;
8152 break;
8153 case 4: /* xmlcharrefreplace */
8154 /* generate replacement (temporarily (mis)uses p) */
8155 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 char buffer[2+29+1+1];
8157 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008158 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 for (cp = buffer; *cp; ++cp) {
8160 x = charmapencode_output(*cp, mapping, res, respos);
8161 if (x==enc_EXCEPTION)
8162 return -1;
8163 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008164 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return -1;
8166 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 }
8168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 *inpos = collendpos;
8170 break;
8171 default:
8172 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008173 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008177 if (PyBytes_Check(repunicode)) {
8178 /* Directly copy bytes result to output. */
8179 Py_ssize_t outsize = PyBytes_Size(*res);
8180 Py_ssize_t requiredsize;
8181 repsize = PyBytes_Size(repunicode);
8182 requiredsize = *respos + repsize;
8183 if (requiredsize > outsize)
8184 /* Make room for all additional bytes. */
8185 if (charmapencode_resize(res, respos, requiredsize)) {
8186 Py_DECREF(repunicode);
8187 return -1;
8188 }
8189 memcpy(PyBytes_AsString(*res) + *respos,
8190 PyBytes_AsString(repunicode), repsize);
8191 *respos += repsize;
8192 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008193 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008194 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008195 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008197 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008198 Py_DECREF(repunicode);
8199 return -1;
8200 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008201 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008202 data = PyUnicode_DATA(repunicode);
8203 kind = PyUnicode_KIND(repunicode);
8204 for (index = 0; index < repsize; index++) {
8205 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8206 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008208 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return -1;
8210 }
8211 else if (x==enc_FAILED) {
8212 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008213 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 return -1;
8215 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008216 }
8217 *inpos = newpos;
8218 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219 }
8220 return 0;
8221}
8222
Alexander Belopolsky40018472011-02-26 01:02:56 +00008223PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008224_PyUnicode_EncodeCharmap(PyObject *unicode,
8225 PyObject *mapping,
8226 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228 /* output object */
8229 PyObject *res = NULL;
8230 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008231 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008233 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008234 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 PyObject *errorHandler = NULL;
8236 PyObject *exc = NULL;
8237 /* the following variable is used for caching string comparisons
8238 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8239 * 3=ignore, 4=xmlcharrefreplace */
8240 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008241 void *data;
8242 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243
Benjamin Petersonbac79492012-01-14 13:34:47 -05008244 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008245 return NULL;
8246 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008247 data = PyUnicode_DATA(unicode);
8248 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 /* Default to Latin-1 */
8251 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008252 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 /* allocate enough for a simple encoding without
8255 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008256 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 if (res == NULL)
8258 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008259 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008263 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008265 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 if (x==enc_EXCEPTION) /* error */
8267 goto onError;
8268 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008269 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 &exc,
8271 &known_errorHandler, &errorHandler, errors,
8272 &res, &respos)) {
8273 goto onError;
8274 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 else
8277 /* done with this character => adjust input position */
8278 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008282 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008283 if (_PyBytes_Resize(&res, respos) < 0)
8284 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 Py_XDECREF(exc);
8287 Py_XDECREF(errorHandler);
8288 return res;
8289
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 Py_XDECREF(res);
8292 Py_XDECREF(exc);
8293 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 return NULL;
8295}
8296
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297/* Deprecated */
8298PyObject *
8299PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8300 Py_ssize_t size,
8301 PyObject *mapping,
8302 const char *errors)
8303{
8304 PyObject *result;
8305 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8306 if (unicode == NULL)
8307 return NULL;
8308 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8309 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008310 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008311}
8312
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313PyObject *
8314PyUnicode_AsCharmapString(PyObject *unicode,
8315 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316{
8317 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 PyErr_BadArgument();
8319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008321 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322}
8323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325static void
8326make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328 Py_ssize_t startpos, Py_ssize_t endpos,
8329 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 *exceptionObject = _PyUnicodeTranslateError_Create(
8333 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 }
8335 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8337 goto onError;
8338 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8339 goto onError;
8340 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8341 goto onError;
8342 return;
8343 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008344 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 }
8346}
8347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348/* error handling callback helper:
8349 build arguments, call the callback and check the arguments,
8350 put the result into newpos and return the replacement string, which
8351 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008352static PyObject *
8353unicode_translate_call_errorhandler(const char *errors,
8354 PyObject **errorHandler,
8355 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008357 Py_ssize_t startpos, Py_ssize_t endpos,
8358 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008360 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008362 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 PyObject *restuple;
8364 PyObject *resunicode;
8365
8366 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 }
8371
8372 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376
8377 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008382 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 Py_DECREF(restuple);
8384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 }
8386 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 &resunicode, &i_newpos)) {
8388 Py_DECREF(restuple);
8389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008391 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008393 else
8394 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 Py_DECREF(restuple);
8398 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 Py_INCREF(resunicode);
8401 Py_DECREF(restuple);
8402 return resunicode;
8403}
8404
8405/* Lookup the character ch in the mapping and put the result in result,
8406 which must be decrefed by the caller.
8407 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008408static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410{
Christian Heimes217cfd12007-12-02 14:31:20 +00008411 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 PyObject *x;
8413
8414 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 x = PyObject_GetItem(mapping, w);
8417 Py_DECREF(w);
8418 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8420 /* No mapping found means: use 1:1 mapping. */
8421 PyErr_Clear();
8422 *result = NULL;
8423 return 0;
8424 } else
8425 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 }
8427 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 *result = x;
8429 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008431 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008433 if (value < 0 || value > MAX_UNICODE) {
8434 PyErr_Format(PyExc_ValueError,
8435 "character mapping must be in range(0x%x)",
8436 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 Py_DECREF(x);
8438 return -1;
8439 }
8440 *result = x;
8441 return 0;
8442 }
8443 else if (PyUnicode_Check(x)) {
8444 *result = x;
8445 return 0;
8446 }
8447 else {
8448 /* wrong return value */
8449 PyErr_SetString(PyExc_TypeError,
8450 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 Py_DECREF(x);
8452 return -1;
8453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008454}
Victor Stinner1194ea02014-04-04 19:37:40 +02008455
8456/* lookup the character, write the result into the writer.
8457 Return 1 if the result was written into the writer, return 0 if the mapping
8458 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008459static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008460charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8461 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462{
Victor Stinner1194ea02014-04-04 19:37:40 +02008463 PyObject *item;
8464
8465 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008467
8468 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008470 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008473 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008475
8476 if (item == Py_None) {
8477 Py_DECREF(item);
8478 return 0;
8479 }
8480
8481 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008482 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8483 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8484 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008485 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8486 Py_DECREF(item);
8487 return -1;
8488 }
8489 Py_DECREF(item);
8490 return 1;
8491 }
8492
8493 if (!PyUnicode_Check(item)) {
8494 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008496 }
8497
8498 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8499 Py_DECREF(item);
8500 return -1;
8501 }
8502
8503 Py_DECREF(item);
8504 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505}
8506
Victor Stinner89a76ab2014-04-05 11:44:04 +02008507static int
8508unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8509 Py_UCS1 *translate)
8510{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008511 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008512 int ret = 0;
8513
Victor Stinner89a76ab2014-04-05 11:44:04 +02008514 if (charmaptranslate_lookup(ch, mapping, &item)) {
8515 return -1;
8516 }
8517
8518 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008519 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008520 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008521 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008522 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008523 /* not found => default to 1:1 mapping */
8524 translate[ch] = ch;
8525 return 1;
8526 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008527 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008528 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008529 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8530 used it */
8531 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008532 /* invalid character or character outside ASCII:
8533 skip the fast translate */
8534 goto exit;
8535 }
8536 translate[ch] = (Py_UCS1)replace;
8537 }
8538 else if (PyUnicode_Check(item)) {
8539 Py_UCS4 replace;
8540
8541 if (PyUnicode_READY(item) == -1) {
8542 Py_DECREF(item);
8543 return -1;
8544 }
8545 if (PyUnicode_GET_LENGTH(item) != 1)
8546 goto exit;
8547
8548 replace = PyUnicode_READ_CHAR(item, 0);
8549 if (replace > 127)
8550 goto exit;
8551 translate[ch] = (Py_UCS1)replace;
8552 }
8553 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008554 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008555 goto exit;
8556 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008557 ret = 1;
8558
Benjamin Peterson1365de72014-04-07 20:15:41 -04008559 exit:
8560 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008561 return ret;
8562}
8563
8564/* Fast path for ascii => ascii translation. Return 1 if the whole string
8565 was translated into writer, return 0 if the input string was partially
8566 translated into writer, raise an exception and return -1 on error. */
8567static int
8568unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008569 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008570{
Victor Stinner872b2912014-04-05 14:27:07 +02008571 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008572 Py_ssize_t len;
8573 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008574 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008575
8576 if (PyUnicode_READY(input) == -1)
8577 return -1;
8578 if (!PyUnicode_IS_ASCII(input))
8579 return 0;
8580 len = PyUnicode_GET_LENGTH(input);
8581
Victor Stinner872b2912014-04-05 14:27:07 +02008582 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008583
8584 in = PyUnicode_1BYTE_DATA(input);
8585 end = in + len;
8586
8587 assert(PyUnicode_IS_ASCII(writer->buffer));
8588 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8589 out = PyUnicode_1BYTE_DATA(writer->buffer);
8590
Victor Stinner872b2912014-04-05 14:27:07 +02008591 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008592 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008593 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008594 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008595 int translate = unicode_fast_translate_lookup(mapping, ch,
8596 ascii_table);
8597 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008598 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008599 if (translate == 0)
8600 goto exit;
8601 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008602 }
Victor Stinner872b2912014-04-05 14:27:07 +02008603 if (ch2 == 0xfe) {
8604 if (ignore)
8605 continue;
8606 goto exit;
8607 }
8608 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008609 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008610 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008611 }
Victor Stinner872b2912014-04-05 14:27:07 +02008612 res = 1;
8613
8614exit:
8615 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8616 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008617}
8618
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620_PyUnicode_TranslateCharmap(PyObject *input,
8621 PyObject *mapping,
8622 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008625 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 Py_ssize_t size, i;
8627 int kind;
8628 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008629 _PyUnicodeWriter writer;
8630 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 char *reason = "character maps to <undefined>";
8632 PyObject *errorHandler = NULL;
8633 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008634 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008635 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 PyErr_BadArgument();
8639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 if (PyUnicode_READY(input) == -1)
8643 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008644 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 kind = PyUnicode_KIND(input);
8646 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647
8648 if (size == 0) {
8649 Py_INCREF(input);
8650 return input;
8651 }
8652
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 /* allocate enough for a simple 1:1 translation without
8654 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008655 _PyUnicodeWriter_Init(&writer);
8656 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658
Victor Stinner872b2912014-04-05 14:27:07 +02008659 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8660
8661 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008662 if (res < 0) {
8663 _PyUnicodeWriter_Dealloc(&writer);
8664 return NULL;
8665 }
8666 if (res == 1)
8667 return _PyUnicodeWriter_Finish(&writer);
8668
Victor Stinner89a76ab2014-04-05 11:44:04 +02008669 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008672 int translate;
8673 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8674 Py_ssize_t newpos;
8675 /* startpos for collecting untranslatable chars */
8676 Py_ssize_t collstart;
8677 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008678 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
Victor Stinner1194ea02014-04-04 19:37:40 +02008680 ch = PyUnicode_READ(kind, data, i);
8681 translate = charmaptranslate_output(ch, mapping, &writer);
8682 if (translate < 0)
8683 goto onError;
8684
8685 if (translate != 0) {
8686 /* it worked => adjust input pointer */
8687 ++i;
8688 continue;
8689 }
8690
8691 /* untranslatable character */
8692 collstart = i;
8693 collend = i+1;
8694
8695 /* find all untranslatable characters */
8696 while (collend < size) {
8697 PyObject *x;
8698 ch = PyUnicode_READ(kind, data, collend);
8699 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008700 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008701 Py_XDECREF(x);
8702 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008704 ++collend;
8705 }
8706
8707 if (ignore) {
8708 i = collend;
8709 }
8710 else {
8711 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8712 reason, input, &exc,
8713 collstart, collend, &newpos);
8714 if (repunicode == NULL)
8715 goto onError;
8716 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008718 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008719 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008720 Py_DECREF(repunicode);
8721 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008722 }
8723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 Py_XDECREF(exc);
8725 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008726 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008729 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 Py_XDECREF(exc);
8731 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 return NULL;
8733}
8734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735/* Deprecated. Use PyUnicode_Translate instead. */
8736PyObject *
8737PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8738 Py_ssize_t size,
8739 PyObject *mapping,
8740 const char *errors)
8741{
Christian Heimes5f520f42012-09-11 14:03:25 +02008742 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8744 if (!unicode)
8745 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008746 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8747 Py_DECREF(unicode);
8748 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749}
8750
Alexander Belopolsky40018472011-02-26 01:02:56 +00008751PyObject *
8752PyUnicode_Translate(PyObject *str,
8753 PyObject *mapping,
8754 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755{
8756 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008757
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758 str = PyUnicode_FromObject(str);
8759 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008760 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 Py_DECREF(str);
8763 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764}
Tim Petersced69f82003-09-16 20:30:58 +00008765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008767fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768{
8769 /* No need to call PyUnicode_READY(self) because this function is only
8770 called as a callback from fixup() which does it already. */
8771 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8772 const int kind = PyUnicode_KIND(self);
8773 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008774 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008775 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 Py_ssize_t i;
8777
8778 for (i = 0; i < len; ++i) {
8779 ch = PyUnicode_READ(kind, data, i);
8780 fixed = 0;
8781 if (ch > 127) {
8782 if (Py_UNICODE_ISSPACE(ch))
8783 fixed = ' ';
8784 else {
8785 const int decimal = Py_UNICODE_TODECIMAL(ch);
8786 if (decimal >= 0)
8787 fixed = '0' + decimal;
8788 }
8789 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008790 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008791 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 PyUnicode_WRITE(kind, data, i, fixed);
8793 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008794 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008795 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 }
8798
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008799 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800}
8801
8802PyObject *
8803_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8804{
8805 if (!PyUnicode_Check(unicode)) {
8806 PyErr_BadInternalCall();
8807 return NULL;
8808 }
8809 if (PyUnicode_READY(unicode) == -1)
8810 return NULL;
8811 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8812 /* If the string is already ASCII, just return the same string */
8813 Py_INCREF(unicode);
8814 return unicode;
8815 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008816 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817}
8818
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008819PyObject *
8820PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8821 Py_ssize_t length)
8822{
Victor Stinnerf0124502011-11-21 23:12:56 +01008823 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008824 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008825 Py_UCS4 maxchar;
8826 enum PyUnicode_Kind kind;
8827 void *data;
8828
Victor Stinner99d7ad02012-02-22 13:37:39 +01008829 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008830 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008831 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008832 if (ch > 127) {
8833 int decimal = Py_UNICODE_TODECIMAL(ch);
8834 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008835 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008836 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008837 }
8838 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008839
8840 /* Copy to a new string */
8841 decimal = PyUnicode_New(length, maxchar);
8842 if (decimal == NULL)
8843 return decimal;
8844 kind = PyUnicode_KIND(decimal);
8845 data = PyUnicode_DATA(decimal);
8846 /* Iterate over code points */
8847 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008848 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008849 if (ch > 127) {
8850 int decimal = Py_UNICODE_TODECIMAL(ch);
8851 if (decimal >= 0)
8852 ch = '0' + decimal;
8853 }
8854 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008856 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008857}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008858/* --- Decimal Encoder ---------------------------------------------------- */
8859
Alexander Belopolsky40018472011-02-26 01:02:56 +00008860int
8861PyUnicode_EncodeDecimal(Py_UNICODE *s,
8862 Py_ssize_t length,
8863 char *output,
8864 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008865{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008866 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008867 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008868 enum PyUnicode_Kind kind;
8869 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008870
8871 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 PyErr_BadArgument();
8873 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008874 }
8875
Victor Stinner42bf7752011-11-21 22:52:58 +01008876 unicode = PyUnicode_FromUnicode(s, length);
8877 if (unicode == NULL)
8878 return -1;
8879
Benjamin Petersonbac79492012-01-14 13:34:47 -05008880 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008881 Py_DECREF(unicode);
8882 return -1;
8883 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008884 kind = PyUnicode_KIND(unicode);
8885 data = PyUnicode_DATA(unicode);
8886
Victor Stinnerb84d7232011-11-22 01:50:07 +01008887 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008888 PyObject *exc;
8889 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008891 Py_ssize_t startpos;
8892
8893 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008894
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008896 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008897 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008899 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 decimal = Py_UNICODE_TODECIMAL(ch);
8901 if (decimal >= 0) {
8902 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008903 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 continue;
8905 }
8906 if (0 < ch && ch < 256) {
8907 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008908 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 continue;
8910 }
Victor Stinner6345be92011-11-25 20:09:01 +01008911
Victor Stinner42bf7752011-11-21 22:52:58 +01008912 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008913 exc = NULL;
8914 raise_encode_exception(&exc, "decimal", unicode,
8915 startpos, startpos+1,
8916 "invalid decimal Unicode string");
8917 Py_XDECREF(exc);
8918 Py_DECREF(unicode);
8919 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008920 }
8921 /* 0-terminate the output string */
8922 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008923 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008924 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008925}
8926
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927/* --- Helpers ------------------------------------------------------------ */
8928
/* helper macro to fix up start/end slice values in place:
   - `end` greater than `len` is clipped to `len`;
   - a negative `end` or `start` has `len` added to it and is then clipped
     to 0 if it is still negative.
   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once.

   The body is wrapped in do { ... } while (0) so that the macro expands to a
   single statement and stays correct as the body of an unbraced if/else.
   Use it as a statement followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008945any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 Py_ssize_t start,
8947 Py_ssize_t end)
8948{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008949 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 void *buf1, *buf2;
8951 Py_ssize_t len1, len2, result;
8952
8953 kind1 = PyUnicode_KIND(s1);
8954 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008955 if (kind1 < kind2)
8956 return -1;
8957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 len1 = PyUnicode_GET_LENGTH(s1);
8959 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008960 ADJUST_INDICES(start, end, len1);
8961 if (end - start < len2)
8962 return -1;
8963
8964 buf1 = PyUnicode_DATA(s1);
8965 buf2 = PyUnicode_DATA(s2);
8966 if (len2 == 1) {
8967 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8968 result = findchar((const char *)buf1 + kind1*start,
8969 kind1, end - start, ch, direction);
8970 if (result == -1)
8971 return -1;
8972 else
8973 return start + result;
8974 }
8975
8976 if (kind2 != kind1) {
8977 buf2 = _PyUnicode_AsKind(s2, kind1);
8978 if (!buf2)
8979 return -2;
8980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981
Victor Stinner794d5672011-10-10 03:21:36 +02008982 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008983 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02008984 case PyUnicode_1BYTE_KIND:
8985 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8986 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8987 else
8988 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8989 break;
8990 case PyUnicode_2BYTE_KIND:
8991 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8992 break;
8993 case PyUnicode_4BYTE_KIND:
8994 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8995 break;
8996 default:
8997 assert(0); result = -2;
8998 }
8999 }
9000 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009001 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009002 case PyUnicode_1BYTE_KIND:
9003 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9004 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9005 else
9006 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9007 break;
9008 case PyUnicode_2BYTE_KIND:
9009 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9010 break;
9011 case PyUnicode_4BYTE_KIND:
9012 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9013 break;
9014 default:
9015 assert(0); result = -2;
9016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 }
9018
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009019 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 PyMem_Free(buf2);
9021
9022 return result;
9023}
9024
9025Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009026_PyUnicode_InsertThousandsGrouping(
9027 PyObject *unicode, Py_ssize_t index,
9028 Py_ssize_t n_buffer,
9029 void *digits, Py_ssize_t n_digits,
9030 Py_ssize_t min_width,
9031 const char *grouping, PyObject *thousands_sep,
9032 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033{
Victor Stinner41a863c2012-02-24 00:37:51 +01009034 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009035 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009036 Py_ssize_t thousands_sep_len;
9037 Py_ssize_t len;
9038
9039 if (unicode != NULL) {
9040 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009041 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009042 }
9043 else {
9044 kind = PyUnicode_1BYTE_KIND;
9045 data = NULL;
9046 }
9047 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9048 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9049 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9050 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009051 if (thousands_sep_kind < kind) {
9052 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9053 if (!thousands_sep_data)
9054 return -1;
9055 }
9056 else {
9057 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9058 if (!data)
9059 return -1;
9060 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009061 }
9062
Benjamin Petersonead6b532011-12-20 17:23:42 -06009063 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009065 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009066 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009067 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009068 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009069 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009070 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009071 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009072 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009074 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009075 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009078 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009079 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009080 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009084 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009085 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009086 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 break;
9088 default:
9089 assert(0);
9090 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009092 if (unicode != NULL && thousands_sep_kind != kind) {
9093 if (thousands_sep_kind < kind)
9094 PyMem_Free(thousands_sep_data);
9095 else
9096 PyMem_Free(data);
9097 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009098 if (unicode == NULL) {
9099 *maxchar = 127;
9100 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009101 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009102 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 }
9104 }
9105 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106}
9107
9108
Alexander Belopolsky40018472011-02-26 01:02:56 +00009109Py_ssize_t
9110PyUnicode_Count(PyObject *str,
9111 PyObject *substr,
9112 Py_ssize_t start,
9113 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009115 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009116 PyObject* str_obj;
9117 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009118 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 void *buf1 = NULL, *buf2 = NULL;
9120 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009121
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009122 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009123 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009125 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009126 if (!sub_obj) {
9127 Py_DECREF(str_obj);
9128 return -1;
9129 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009130 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009131 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 Py_DECREF(str_obj);
9133 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 }
Tim Petersced69f82003-09-16 20:30:58 +00009135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 kind1 = PyUnicode_KIND(str_obj);
9137 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009138 if (kind1 < kind2) {
9139 Py_DECREF(sub_obj);
9140 Py_DECREF(str_obj);
9141 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009142 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 len1 = PyUnicode_GET_LENGTH(str_obj);
9145 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009147 if (end - start < len2) {
9148 Py_DECREF(sub_obj);
9149 Py_DECREF(str_obj);
9150 return 0;
9151 }
9152
9153 buf1 = PyUnicode_DATA(str_obj);
9154 buf2 = PyUnicode_DATA(sub_obj);
9155 if (kind2 != kind1) {
9156 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9157 if (!buf2)
9158 goto onError;
9159 }
9160
9161 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009163 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9164 result = asciilib_count(
9165 ((Py_UCS1*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
9168 else
9169 result = ucs1lib_count(
9170 ((Py_UCS1*)buf1) + start, end - start,
9171 buf2, len2, PY_SSIZE_T_MAX
9172 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 break;
9174 case PyUnicode_2BYTE_KIND:
9175 result = ucs2lib_count(
9176 ((Py_UCS2*)buf1) + start, end - start,
9177 buf2, len2, PY_SSIZE_T_MAX
9178 );
9179 break;
9180 case PyUnicode_4BYTE_KIND:
9181 result = ucs4lib_count(
9182 ((Py_UCS4*)buf1) + start, end - start,
9183 buf2, len2, PY_SSIZE_T_MAX
9184 );
9185 break;
9186 default:
9187 assert(0); result = 0;
9188 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189
9190 Py_DECREF(sub_obj);
9191 Py_DECREF(str_obj);
9192
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009193 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 PyMem_Free(buf2);
9195
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 onError:
9198 Py_DECREF(sub_obj);
9199 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009200 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 PyMem_Free(buf2);
9202 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203}
9204
Alexander Belopolsky40018472011-02-26 01:02:56 +00009205Py_ssize_t
9206PyUnicode_Find(PyObject *str,
9207 PyObject *sub,
9208 Py_ssize_t start,
9209 Py_ssize_t end,
9210 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009212 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009215 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009217 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009218 if (!sub) {
9219 Py_DECREF(str);
9220 return -2;
9221 }
9222 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9223 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 Py_DECREF(str);
9225 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 }
Tim Petersced69f82003-09-16 20:30:58 +00009227
Victor Stinner794d5672011-10-10 03:21:36 +02009228 result = any_find_slice(direction,
9229 str, sub, start, end
9230 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009231
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009233 Py_DECREF(sub);
9234
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 return result;
9236}
9237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238Py_ssize_t
9239PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9240 Py_ssize_t start, Py_ssize_t end,
9241 int direction)
9242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009244 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 if (PyUnicode_READY(str) == -1)
9246 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009247 if (start < 0 || end < 0) {
9248 PyErr_SetString(PyExc_IndexError, "string index out of range");
9249 return -2;
9250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 if (end > PyUnicode_GET_LENGTH(str))
9252 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009253 if (start >= end)
9254 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009256 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9257 kind, end-start, ch, direction);
9258 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009260 else
9261 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262}
9263
Alexander Belopolsky40018472011-02-26 01:02:56 +00009264static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009265tailmatch(PyObject *self,
9266 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009267 Py_ssize_t start,
9268 Py_ssize_t end,
9269 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 int kind_self;
9272 int kind_sub;
9273 void *data_self;
9274 void *data_sub;
9275 Py_ssize_t offset;
9276 Py_ssize_t i;
9277 Py_ssize_t end_sub;
9278
9279 if (PyUnicode_READY(self) == -1 ||
9280 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009281 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9284 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009288 if (PyUnicode_GET_LENGTH(substring) == 0)
9289 return 1;
9290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 kind_self = PyUnicode_KIND(self);
9292 data_self = PyUnicode_DATA(self);
9293 kind_sub = PyUnicode_KIND(substring);
9294 data_sub = PyUnicode_DATA(substring);
9295 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9296
9297 if (direction > 0)
9298 offset = end;
9299 else
9300 offset = start;
9301
9302 if (PyUnicode_READ(kind_self, data_self, offset) ==
9303 PyUnicode_READ(kind_sub, data_sub, 0) &&
9304 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9305 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9306 /* If both are of the same kind, memcmp is sufficient */
9307 if (kind_self == kind_sub) {
9308 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009309 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 data_sub,
9311 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009312 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314 /* otherwise we have to compare each character by first accesing it */
9315 else {
9316 /* We do not need to compare 0 and len(substring)-1 because
9317 the if statement above ensured already that they are equal
9318 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 for (i = 1; i < end_sub; ++i) {
9320 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9321 PyUnicode_READ(kind_sub, data_sub, i))
9322 return 0;
9323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 }
9327
9328 return 0;
9329}
9330
Alexander Belopolsky40018472011-02-26 01:02:56 +00009331Py_ssize_t
9332PyUnicode_Tailmatch(PyObject *str,
9333 PyObject *substr,
9334 Py_ssize_t start,
9335 Py_ssize_t end,
9336 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009338 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009339
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 str = PyUnicode_FromObject(str);
9341 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343 substr = PyUnicode_FromObject(substr);
9344 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 Py_DECREF(str);
9346 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 }
Tim Petersced69f82003-09-16 20:30:58 +00009348
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009349 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351 Py_DECREF(str);
9352 Py_DECREF(substr);
9353 return result;
9354}
9355
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356/* Apply fixfct filter to the Unicode object self and return a
9357 reference to the modified object */
9358
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009360fixup(PyObject *self,
9361 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 PyObject *u;
9364 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009365 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009367 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009370 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 /* fix functions return the new maximum character in a string,
9373 if the kind of the resulting unicode object does not change,
9374 everything is fine. Otherwise we need to change the string kind
9375 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009376 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009377
9378 if (maxchar_new == 0) {
9379 /* no changes */;
9380 if (PyUnicode_CheckExact(self)) {
9381 Py_DECREF(u);
9382 Py_INCREF(self);
9383 return self;
9384 }
9385 else
9386 return u;
9387 }
9388
Victor Stinnere6abb482012-05-02 01:15:40 +02009389 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390
Victor Stinnereaab6042011-12-11 22:22:39 +01009391 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009393
9394 /* In case the maximum character changed, we need to
9395 convert the string to the new category. */
9396 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9397 if (v == NULL) {
9398 Py_DECREF(u);
9399 return NULL;
9400 }
9401 if (maxchar_new > maxchar_old) {
9402 /* If the maxchar increased so that the kind changed, not all
9403 characters are representable anymore and we need to fix the
9404 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009405 _PyUnicode_FastCopyCharacters(v, 0,
9406 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009407 maxchar_old = fixfct(v);
9408 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 }
9410 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009411 _PyUnicode_FastCopyCharacters(v, 0,
9412 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009414 Py_DECREF(u);
9415 assert(_PyUnicode_CheckConsistency(v, 1));
9416 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417}
9418
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009419static PyObject *
9420ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009422 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9423 char *resdata, *data = PyUnicode_DATA(self);
9424 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009425
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009426 res = PyUnicode_New(len, 127);
9427 if (res == NULL)
9428 return NULL;
9429 resdata = PyUnicode_DATA(res);
9430 if (lower)
9431 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009433 _Py_bytes_upper(resdata, data, len);
9434 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435}
9436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009438handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009440 Py_ssize_t j;
9441 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009442 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009443 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009444
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009445 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9446
9447 where ! is a negation and \p{xxx} is a character with property xxx.
9448 */
9449 for (j = i - 1; j >= 0; j--) {
9450 c = PyUnicode_READ(kind, data, j);
9451 if (!_PyUnicode_IsCaseIgnorable(c))
9452 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009454 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9455 if (final_sigma) {
9456 for (j = i + 1; j < length; j++) {
9457 c = PyUnicode_READ(kind, data, j);
9458 if (!_PyUnicode_IsCaseIgnorable(c))
9459 break;
9460 }
9461 final_sigma = j == length || !_PyUnicode_IsCased(c);
9462 }
9463 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464}
9465
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466static int
9467lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9468 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009470 /* Obscure special case. */
9471 if (c == 0x3A3) {
9472 mapped[0] = handle_capital_sigma(kind, data, length, i);
9473 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009475 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478static Py_ssize_t
9479do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009481 Py_ssize_t i, k = 0;
9482 int n_res, j;
9483 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009484
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009485 c = PyUnicode_READ(kind, data, 0);
9486 n_res = _PyUnicode_ToUpperFull(c, mapped);
9487 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009488 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009489 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009491 for (i = 1; i < length; i++) {
9492 c = PyUnicode_READ(kind, data, i);
9493 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9494 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009495 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009496 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009497 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009498 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009499 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500}
9501
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009502static Py_ssize_t
9503do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9504 Py_ssize_t i, k = 0;
9505
9506 for (i = 0; i < length; i++) {
9507 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9508 int n_res, j;
9509 if (Py_UNICODE_ISUPPER(c)) {
9510 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9511 }
9512 else if (Py_UNICODE_ISLOWER(c)) {
9513 n_res = _PyUnicode_ToUpperFull(c, mapped);
9514 }
9515 else {
9516 n_res = 1;
9517 mapped[0] = c;
9518 }
9519 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009520 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009521 res[k++] = mapped[j];
9522 }
9523 }
9524 return k;
9525}
9526
9527static Py_ssize_t
9528do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9529 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009531 Py_ssize_t i, k = 0;
9532
9533 for (i = 0; i < length; i++) {
9534 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9535 int n_res, j;
9536 if (lower)
9537 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9538 else
9539 n_res = _PyUnicode_ToUpperFull(c, mapped);
9540 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009541 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009542 res[k++] = mapped[j];
9543 }
9544 }
9545 return k;
9546}
9547
9548static Py_ssize_t
9549do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9550{
9551 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9552}
9553
9554static Py_ssize_t
9555do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9556{
9557 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9558}
9559
Benjamin Petersone51757f2012-01-12 21:10:29 -05009560static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009561do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9562{
9563 Py_ssize_t i, k = 0;
9564
9565 for (i = 0; i < length; i++) {
9566 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9567 Py_UCS4 mapped[3];
9568 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9569 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009570 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009571 res[k++] = mapped[j];
9572 }
9573 }
9574 return k;
9575}
9576
9577static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009578do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9579{
9580 Py_ssize_t i, k = 0;
9581 int previous_is_cased;
9582
9583 previous_is_cased = 0;
9584 for (i = 0; i < length; i++) {
9585 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9586 Py_UCS4 mapped[3];
9587 int n_res, j;
9588
9589 if (previous_is_cased)
9590 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9591 else
9592 n_res = _PyUnicode_ToTitleFull(c, mapped);
9593
9594 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009595 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009596 res[k++] = mapped[j];
9597 }
9598
9599 previous_is_cased = _PyUnicode_IsCased(c);
9600 }
9601 return k;
9602}
9603
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009604static PyObject *
9605case_operation(PyObject *self,
9606 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9607{
9608 PyObject *res = NULL;
9609 Py_ssize_t length, newlength = 0;
9610 int kind, outkind;
9611 void *data, *outdata;
9612 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9613
Benjamin Petersoneea48462012-01-16 14:28:50 -05009614 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615
9616 kind = PyUnicode_KIND(self);
9617 data = PyUnicode_DATA(self);
9618 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009619 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009620 PyErr_SetString(PyExc_OverflowError, "string is too long");
9621 return NULL;
9622 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009623 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624 if (tmp == NULL)
9625 return PyErr_NoMemory();
9626 newlength = perform(kind, data, length, tmp, &maxchar);
9627 res = PyUnicode_New(newlength, maxchar);
9628 if (res == NULL)
9629 goto leave;
9630 tmpend = tmp + newlength;
9631 outdata = PyUnicode_DATA(res);
9632 outkind = PyUnicode_KIND(res);
9633 switch (outkind) {
9634 case PyUnicode_1BYTE_KIND:
9635 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9636 break;
9637 case PyUnicode_2BYTE_KIND:
9638 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9639 break;
9640 case PyUnicode_4BYTE_KIND:
9641 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9642 break;
9643 default:
9644 assert(0);
9645 break;
9646 }
9647 leave:
9648 PyMem_FREE(tmp);
9649 return res;
9650}
9651
Tim Peters8ce9f162004-08-27 01:49:32 +00009652PyObject *
9653PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009656 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009658 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009659 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9660 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009661 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009663 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009665 int use_memcpy;
9666 unsigned char *res_data = NULL, *sep_data = NULL;
9667 PyObject *last_obj;
9668 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009670 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009671 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009672 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009673 }
9674
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009675 /* NOTE: the following code can't call back into Python code,
9676 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009677 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009678
Tim Peters05eba1f2004-08-27 21:32:02 +00009679 seqlen = PySequence_Fast_GET_SIZE(fseq);
9680 /* If empty sequence, return u"". */
9681 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009682 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009683 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009684 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009685
Tim Peters05eba1f2004-08-27 21:32:02 +00009686 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009687 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009688 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009689 if (seqlen == 1) {
9690 if (PyUnicode_CheckExact(items[0])) {
9691 res = items[0];
9692 Py_INCREF(res);
9693 Py_DECREF(fseq);
9694 return res;
9695 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009696 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009697 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009698 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009699 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009700 /* Set up sep and seplen */
9701 if (separator == NULL) {
9702 /* fall back to a blank space separator */
9703 sep = PyUnicode_FromOrdinal(' ');
9704 if (!sep)
9705 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009706 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009707 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009708 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009709 else {
9710 if (!PyUnicode_Check(separator)) {
9711 PyErr_Format(PyExc_TypeError,
9712 "separator: expected str instance,"
9713 " %.80s found",
9714 Py_TYPE(separator)->tp_name);
9715 goto onError;
9716 }
9717 if (PyUnicode_READY(separator))
9718 goto onError;
9719 sep = separator;
9720 seplen = PyUnicode_GET_LENGTH(separator);
9721 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9722 /* inc refcount to keep this code path symmetric with the
9723 above case of a blank separator */
9724 Py_INCREF(sep);
9725 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009726 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009727 }
9728
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009729 /* There are at least two things to join, or else we have a subclass
9730 * of str in the sequence.
9731 * Do a pre-pass to figure out the total amount of space we'll
9732 * need (sz), and see whether all argument are strings.
9733 */
9734 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009735#ifdef Py_DEBUG
9736 use_memcpy = 0;
9737#else
9738 use_memcpy = 1;
9739#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009740 for (i = 0; i < seqlen; i++) {
9741 const Py_ssize_t old_sz = sz;
9742 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 if (!PyUnicode_Check(item)) {
9744 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009745 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 " %.80s found",
9747 i, Py_TYPE(item)->tp_name);
9748 goto onError;
9749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (PyUnicode_READY(item) == -1)
9751 goto onError;
9752 sz += PyUnicode_GET_LENGTH(item);
9753 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009754 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009755 if (i != 0)
9756 sz += seplen;
9757 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9758 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009759 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009760 goto onError;
9761 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009762 if (use_memcpy && last_obj != NULL) {
9763 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9764 use_memcpy = 0;
9765 }
9766 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009767 }
Tim Petersced69f82003-09-16 20:30:58 +00009768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009770 if (res == NULL)
9771 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009772
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009773 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009774#ifdef Py_DEBUG
9775 use_memcpy = 0;
9776#else
9777 if (use_memcpy) {
9778 res_data = PyUnicode_1BYTE_DATA(res);
9779 kind = PyUnicode_KIND(res);
9780 if (seplen != 0)
9781 sep_data = PyUnicode_1BYTE_DATA(sep);
9782 }
9783#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009784 if (use_memcpy) {
9785 for (i = 0; i < seqlen; ++i) {
9786 Py_ssize_t itemlen;
9787 item = items[i];
9788
9789 /* Copy item, and maybe the separator. */
9790 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009791 Py_MEMCPY(res_data,
9792 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009793 kind * seplen);
9794 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009795 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009796
9797 itemlen = PyUnicode_GET_LENGTH(item);
9798 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009799 Py_MEMCPY(res_data,
9800 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009801 kind * itemlen);
9802 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009803 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009804 }
9805 assert(res_data == PyUnicode_1BYTE_DATA(res)
9806 + kind * PyUnicode_GET_LENGTH(res));
9807 }
9808 else {
9809 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9810 Py_ssize_t itemlen;
9811 item = items[i];
9812
9813 /* Copy item, and maybe the separator. */
9814 if (i && seplen != 0) {
9815 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9816 res_offset += seplen;
9817 }
9818
9819 itemlen = PyUnicode_GET_LENGTH(item);
9820 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009821 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009822 res_offset += itemlen;
9823 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009824 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009825 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009826 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009827
Tim Peters05eba1f2004-08-27 21:32:02 +00009828 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009830 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009834 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009836 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837 return NULL;
9838}
9839
/* FILL(kind, data, value, start, length)
 *
 * Store `length` copies of the code point `value` into the character buffer
 * `data`, beginning at character index `start`.  `kind` selects the element
 * width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND.
 * PyUnicode_WCHAR_KIND is rejected by an assertion.  `value` is converted to
 * the element type of the buffer, so the caller must make sure it fits.
 *
 * Every argument is parenthesised in the expansion and evaluated exactly
 * once, so expressions (including ones with side effects) are safe to pass.
 * The local names carry a `fill_..._` prefix/suffix to keep them from
 * shadowing identifiers used in the caller's argument expressions.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (kind); \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t fill_i_ = 0; \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *fill_to_ = (unsigned char *)(data) + fill_start_; \
            memset(fill_to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *fill_to_ = (Py_UCS2 *)(data) + fill_start_; \
            for (; fill_i_ < fill_length_; ++fill_i_, ++fill_to_) \
                *fill_to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *fill_to_ = (Py_UCS4 *)(data) + fill_start_; \
            for (; fill_i_ < fill_length_; ++fill_i_, ++fill_to_) \
                *fill_to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9863
Victor Stinnerd3f08822012-05-29 12:57:52 +02009864void
9865_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9866 Py_UCS4 fill_char)
9867{
9868 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9869 const void *data = PyUnicode_DATA(unicode);
9870 assert(PyUnicode_IS_READY(unicode));
9871 assert(unicode_modifiable(unicode));
9872 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9873 assert(start >= 0);
9874 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9875 FILL(kind, data, fill_char, start, length);
9876}
9877
Victor Stinner3fe55312012-01-04 00:33:50 +01009878Py_ssize_t
9879PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9880 Py_UCS4 fill_char)
9881{
9882 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009883
9884 if (!PyUnicode_Check(unicode)) {
9885 PyErr_BadInternalCall();
9886 return -1;
9887 }
9888 if (PyUnicode_READY(unicode) == -1)
9889 return -1;
9890 if (unicode_check_modifiable(unicode))
9891 return -1;
9892
Victor Stinnerd3f08822012-05-29 12:57:52 +02009893 if (start < 0) {
9894 PyErr_SetString(PyExc_IndexError, "string index out of range");
9895 return -1;
9896 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009897 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9898 PyErr_SetString(PyExc_ValueError,
9899 "fill character is bigger than "
9900 "the string maximum character");
9901 return -1;
9902 }
9903
9904 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9905 length = Py_MIN(maxlen, length);
9906 if (length <= 0)
9907 return 0;
9908
Victor Stinnerd3f08822012-05-29 12:57:52 +02009909 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009910 return length;
9911}
9912
Victor Stinner9310abb2011-10-05 00:59:23 +02009913static PyObject *
9914pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009915 Py_ssize_t left,
9916 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 PyObject *u;
9920 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009921 int kind;
9922 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923
9924 if (left < 0)
9925 left = 0;
9926 if (right < 0)
9927 right = 0;
9928
Victor Stinnerc4b49542011-12-11 22:44:26 +01009929 if (left == 0 && right == 0)
9930 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9933 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009934 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9935 return NULL;
9936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009938 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009940 if (!u)
9941 return NULL;
9942
9943 kind = PyUnicode_KIND(u);
9944 data = PyUnicode_DATA(u);
9945 if (left)
9946 FILL(kind, data, fill, 0, left);
9947 if (right)
9948 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009949 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009950 assert(_PyUnicode_CheckConsistency(u, 1));
9951 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952}
9953
Alexander Belopolsky40018472011-02-26 01:02:56 +00009954PyObject *
9955PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958
9959 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009960 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009961 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009962 if (PyUnicode_READY(string) == -1) {
9963 Py_DECREF(string);
9964 return NULL;
9965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966
Benjamin Petersonead6b532011-12-20 17:23:42 -06009967 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009969 if (PyUnicode_IS_ASCII(string))
9970 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009972 PyUnicode_GET_LENGTH(string), keepends);
9973 else
9974 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009975 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009976 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 break;
9978 case PyUnicode_2BYTE_KIND:
9979 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009980 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 PyUnicode_GET_LENGTH(string), keepends);
9982 break;
9983 case PyUnicode_4BYTE_KIND:
9984 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009985 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 PyUnicode_GET_LENGTH(string), keepends);
9987 break;
9988 default:
9989 assert(0);
9990 list = 0;
9991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992 Py_DECREF(string);
9993 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994}
9995
Alexander Belopolsky40018472011-02-26 01:02:56 +00009996static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009997split(PyObject *self,
9998 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009999 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010001 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 void *buf1, *buf2;
10003 Py_ssize_t len1, len2;
10004 PyObject* out;
10005
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010007 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (PyUnicode_READY(self) == -1)
10010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010013 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010015 if (PyUnicode_IS_ASCII(self))
10016 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010018 PyUnicode_GET_LENGTH(self), maxcount
10019 );
10020 else
10021 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010023 PyUnicode_GET_LENGTH(self), maxcount
10024 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 case PyUnicode_2BYTE_KIND:
10026 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010027 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 PyUnicode_GET_LENGTH(self), maxcount
10029 );
10030 case PyUnicode_4BYTE_KIND:
10031 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010032 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 PyUnicode_GET_LENGTH(self), maxcount
10034 );
10035 default:
10036 assert(0);
10037 return NULL;
10038 }
10039
10040 if (PyUnicode_READY(substring) == -1)
10041 return NULL;
10042
10043 kind1 = PyUnicode_KIND(self);
10044 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 len1 = PyUnicode_GET_LENGTH(self);
10046 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010047 if (kind1 < kind2 || len1 < len2) {
10048 out = PyList_New(1);
10049 if (out == NULL)
10050 return NULL;
10051 Py_INCREF(self);
10052 PyList_SET_ITEM(out, 0, self);
10053 return out;
10054 }
10055 buf1 = PyUnicode_DATA(self);
10056 buf2 = PyUnicode_DATA(substring);
10057 if (kind2 != kind1) {
10058 buf2 = _PyUnicode_AsKind(substring, kind1);
10059 if (!buf2)
10060 return NULL;
10061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010063 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010065 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10066 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010067 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010068 else
10069 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010070 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 break;
10072 case PyUnicode_2BYTE_KIND:
10073 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010074 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 break;
10076 case PyUnicode_4BYTE_KIND:
10077 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010078 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 break;
10080 default:
10081 out = NULL;
10082 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010083 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 PyMem_Free(buf2);
10085 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086}
10087
Alexander Belopolsky40018472011-02-26 01:02:56 +000010088static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010089rsplit(PyObject *self,
10090 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010091 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010092{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010093 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 void *buf1, *buf2;
10095 Py_ssize_t len1, len2;
10096 PyObject* out;
10097
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010098 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010099 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 if (PyUnicode_READY(self) == -1)
10102 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010105 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010107 if (PyUnicode_IS_ASCII(self))
10108 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010109 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010110 PyUnicode_GET_LENGTH(self), maxcount
10111 );
10112 else
10113 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010114 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010115 PyUnicode_GET_LENGTH(self), maxcount
10116 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 case PyUnicode_2BYTE_KIND:
10118 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010119 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 PyUnicode_GET_LENGTH(self), maxcount
10121 );
10122 case PyUnicode_4BYTE_KIND:
10123 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010124 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 PyUnicode_GET_LENGTH(self), maxcount
10126 );
10127 default:
10128 assert(0);
10129 return NULL;
10130 }
10131
10132 if (PyUnicode_READY(substring) == -1)
10133 return NULL;
10134
10135 kind1 = PyUnicode_KIND(self);
10136 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 len1 = PyUnicode_GET_LENGTH(self);
10138 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010139 if (kind1 < kind2 || len1 < len2) {
10140 out = PyList_New(1);
10141 if (out == NULL)
10142 return NULL;
10143 Py_INCREF(self);
10144 PyList_SET_ITEM(out, 0, self);
10145 return out;
10146 }
10147 buf1 = PyUnicode_DATA(self);
10148 buf2 = PyUnicode_DATA(substring);
10149 if (kind2 != kind1) {
10150 buf2 = _PyUnicode_AsKind(substring, kind1);
10151 if (!buf2)
10152 return NULL;
10153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010155 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10158 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010160 else
10161 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010162 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 break;
10164 case PyUnicode_2BYTE_KIND:
10165 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010166 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 break;
10168 case PyUnicode_4BYTE_KIND:
10169 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010170 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 break;
10172 default:
10173 out = NULL;
10174 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010175 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 PyMem_Free(buf2);
10177 return out;
10178}
10179
10180static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010181anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10182 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010184 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10187 return asciilib_find(buf1, len1, buf2, len2, offset);
10188 else
10189 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 case PyUnicode_2BYTE_KIND:
10191 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10192 case PyUnicode_4BYTE_KIND:
10193 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10194 }
10195 assert(0);
10196 return -1;
10197}
10198
10199static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010200anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10201 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010203 switch (kind) {
10204 case PyUnicode_1BYTE_KIND:
10205 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10206 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10207 else
10208 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10209 case PyUnicode_2BYTE_KIND:
10210 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10211 case PyUnicode_4BYTE_KIND:
10212 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10213 }
10214 assert(0);
10215 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010216}
10217
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010218static void
10219replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10220 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10221{
10222 int kind = PyUnicode_KIND(u);
10223 void *data = PyUnicode_DATA(u);
10224 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10225 if (kind == PyUnicode_1BYTE_KIND) {
10226 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10227 (Py_UCS1 *)data + len,
10228 u1, u2, maxcount);
10229 }
10230 else if (kind == PyUnicode_2BYTE_KIND) {
10231 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10232 (Py_UCS2 *)data + len,
10233 u1, u2, maxcount);
10234 }
10235 else {
10236 assert(kind == PyUnicode_4BYTE_KIND);
10237 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10238 (Py_UCS4 *)data + len,
10239 u1, u2, maxcount);
10240 }
10241}
10242
Alexander Belopolsky40018472011-02-26 01:02:56 +000010243static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244replace(PyObject *self, PyObject *str1,
10245 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyObject *u;
10248 char *sbuf = PyUnicode_DATA(self);
10249 char *buf1 = PyUnicode_DATA(str1);
10250 char *buf2 = PyUnicode_DATA(str2);
10251 int srelease = 0, release1 = 0, release2 = 0;
10252 int skind = PyUnicode_KIND(self);
10253 int kind1 = PyUnicode_KIND(str1);
10254 int kind2 = PyUnicode_KIND(str2);
10255 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10256 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10257 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010259 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
10261 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010264 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
Victor Stinner59de0ee2011-10-07 10:01:28 +020010266 if (str1 == str2)
10267 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268
Victor Stinner49a0a212011-10-12 23:46:10 +020010269 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010270 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10271 if (maxchar < maxchar_str1)
10272 /* substring too wide to be present */
10273 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010274 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10275 /* Replacing str1 with str2 may cause a maxchar reduction in the
10276 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010277 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010278 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010283 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010285 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010287 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010288
Victor Stinner69ed0f42013-04-09 21:48:24 +020010289 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010290 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010291 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010293 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010297
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010298 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10299 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010300 }
10301 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 int rkind = skind;
10303 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010304 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (kind1 < rkind) {
10307 /* widen substring */
10308 buf1 = _PyUnicode_AsKind(str1, rkind);
10309 if (!buf1) goto error;
10310 release1 = 1;
10311 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010312 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 if (i < 0)
10314 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (rkind > kind2) {
10316 /* widen replacement */
10317 buf2 = _PyUnicode_AsKind(str2, rkind);
10318 if (!buf2) goto error;
10319 release2 = 1;
10320 }
10321 else if (rkind < kind2) {
10322 /* widen self and buf1 */
10323 rkind = kind2;
10324 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010325 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 sbuf = _PyUnicode_AsKind(self, rkind);
10327 if (!sbuf) goto error;
10328 srelease = 1;
10329 buf1 = _PyUnicode_AsKind(str1, rkind);
10330 if (!buf1) goto error;
10331 release1 = 1;
10332 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010333 u = PyUnicode_New(slen, maxchar);
10334 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010336 assert(PyUnicode_KIND(u) == rkind);
10337 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010338
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010339 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010340 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010341 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010343 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010345
10346 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010348 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010350 if (i == -1)
10351 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010352 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010358 }
10359 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010361 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 int rkind = skind;
10363 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010366 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 buf1 = _PyUnicode_AsKind(str1, rkind);
10368 if (!buf1) goto error;
10369 release1 = 1;
10370 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010372 if (n == 0)
10373 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010375 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 buf2 = _PyUnicode_AsKind(str2, rkind);
10377 if (!buf2) goto error;
10378 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010381 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 rkind = kind2;
10383 sbuf = _PyUnicode_AsKind(self, rkind);
10384 if (!sbuf) goto error;
10385 srelease = 1;
10386 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010387 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 buf1 = _PyUnicode_AsKind(str1, rkind);
10389 if (!buf1) goto error;
10390 release1 = 1;
10391 }
10392 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10393 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010394 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 PyErr_SetString(PyExc_OverflowError,
10396 "replace string is too long");
10397 goto error;
10398 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010399 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010400 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010401 _Py_INCREF_UNICODE_EMPTY();
10402 if (!unicode_empty)
10403 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010404 u = unicode_empty;
10405 goto done;
10406 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010407 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 PyErr_SetString(PyExc_OverflowError,
10409 "replace string is too long");
10410 goto error;
10411 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010412 u = PyUnicode_New(new_size, maxchar);
10413 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010415 assert(PyUnicode_KIND(u) == rkind);
10416 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 ires = i = 0;
10418 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010419 while (n-- > 0) {
10420 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010422 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010424 if (j == -1)
10425 break;
10426 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010427 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010428 memcpy(res + rkind * ires,
10429 sbuf + rkind * i,
10430 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010432 }
10433 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010435 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010437 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010444 memcpy(res + rkind * ires,
10445 sbuf + rkind * i,
10446 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010447 }
10448 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 /* interleave */
10450 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010451 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010453 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 if (--n <= 0)
10456 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 memcpy(res + rkind * ires,
10458 sbuf + rkind * i,
10459 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 ires++;
10461 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 memcpy(res + rkind * ires,
10464 sbuf + rkind * i,
10465 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010467 }
10468
10469 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010470 unicode_adjust_maxchar(&u);
10471 if (u == NULL)
10472 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010474
10475 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (srelease)
10477 PyMem_FREE(sbuf);
10478 if (release1)
10479 PyMem_FREE(buf1);
10480 if (release2)
10481 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010482 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484
Benjamin Peterson29060642009-01-31 22:14:21 +000010485 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (srelease)
10488 PyMem_FREE(sbuf);
10489 if (release1)
10490 PyMem_FREE(buf1);
10491 if (release2)
10492 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010493 return unicode_result_unchanged(self);
10494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 error:
10496 if (srelease && sbuf)
10497 PyMem_FREE(sbuf);
10498 if (release1 && buf1)
10499 PyMem_FREE(buf1);
10500 if (release2 && buf2)
10501 PyMem_FREE(buf2);
10502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503}
10504
10505/* --- Unicode Object Methods --------------------------------------------- */
10506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010507PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010508 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509\n\
10510Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010511characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512
10513static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010514unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010516 if (PyUnicode_READY(self) == -1)
10517 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010518 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519}
10520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010521PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523\n\
10524Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010525have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
10527static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010528unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010530 if (PyUnicode_READY(self) == -1)
10531 return NULL;
10532 if (PyUnicode_GET_LENGTH(self) == 0)
10533 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010534 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535}
10536
Benjamin Petersond5890c82012-01-14 13:23:30 -050010537PyDoc_STRVAR(casefold__doc__,
10538 "S.casefold() -> str\n\
10539\n\
10540Return a version of S suitable for caseless comparisons.");
10541
10542static PyObject *
10543unicode_casefold(PyObject *self)
10544{
10545 if (PyUnicode_READY(self) == -1)
10546 return NULL;
10547 if (PyUnicode_IS_ASCII(self))
10548 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010549 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010550}
10551
10552
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010553/* Argument converter. Coerces to a single unicode character */
10554
10555static int
10556convert_uc(PyObject *obj, void *addr)
10557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010559 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010560
Benjamin Peterson14339b62009-01-31 16:36:08 +000010561 uniobj = PyUnicode_FromObject(obj);
10562 if (uniobj == NULL) {
10563 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010565 return 0;
10566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010568 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010570 Py_DECREF(uniobj);
10571 return 0;
10572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010574 Py_DECREF(uniobj);
10575 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010576}
10577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010578PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010581Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010582done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583
10584static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010585unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010587 Py_ssize_t marg, left;
10588 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 Py_UCS4 fillchar = ' ';
10590
Victor Stinnere9a29352011-10-01 02:14:59 +020010591 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
Benjamin Petersonbac79492012-01-14 13:34:47 -050010594 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595 return NULL;
10596
Victor Stinnerc4b49542011-12-11 22:44:26 +010010597 if (PyUnicode_GET_LENGTH(self) >= width)
10598 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
Victor Stinnerc4b49542011-12-11 22:44:26 +010010600 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 left = marg / 2 + (marg & width & 1);
10602
Victor Stinner9310abb2011-10-05 00:59:23 +020010603 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604}
10605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606/* This function assumes that str1 and str2 are readied by the caller. */
10607
Marc-André Lemburge5034372000-08-08 08:04:29 +000010608static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010609unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010610{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010611#define COMPARE(TYPE1, TYPE2) \
10612 do { \
10613 TYPE1* p1 = (TYPE1 *)data1; \
10614 TYPE2* p2 = (TYPE2 *)data2; \
10615 TYPE1* end = p1 + len; \
10616 Py_UCS4 c1, c2; \
10617 for (; p1 != end; p1++, p2++) { \
10618 c1 = *p1; \
10619 c2 = *p2; \
10620 if (c1 != c2) \
10621 return (c1 < c2) ? -1 : 1; \
10622 } \
10623 } \
10624 while (0)
10625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 int kind1, kind2;
10627 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010628 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 kind1 = PyUnicode_KIND(str1);
10631 kind2 = PyUnicode_KIND(str2);
10632 data1 = PyUnicode_DATA(str1);
10633 data2 = PyUnicode_DATA(str2);
10634 len1 = PyUnicode_GET_LENGTH(str1);
10635 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010636 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010637
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010638 switch(kind1) {
10639 case PyUnicode_1BYTE_KIND:
10640 {
10641 switch(kind2) {
10642 case PyUnicode_1BYTE_KIND:
10643 {
10644 int cmp = memcmp(data1, data2, len);
10645 /* normalize result of memcmp() into the range [-1; 1] */
10646 if (cmp < 0)
10647 return -1;
10648 if (cmp > 0)
10649 return 1;
10650 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010651 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010652 case PyUnicode_2BYTE_KIND:
10653 COMPARE(Py_UCS1, Py_UCS2);
10654 break;
10655 case PyUnicode_4BYTE_KIND:
10656 COMPARE(Py_UCS1, Py_UCS4);
10657 break;
10658 default:
10659 assert(0);
10660 }
10661 break;
10662 }
10663 case PyUnicode_2BYTE_KIND:
10664 {
10665 switch(kind2) {
10666 case PyUnicode_1BYTE_KIND:
10667 COMPARE(Py_UCS2, Py_UCS1);
10668 break;
10669 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010670 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010671 COMPARE(Py_UCS2, Py_UCS2);
10672 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010673 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010674 case PyUnicode_4BYTE_KIND:
10675 COMPARE(Py_UCS2, Py_UCS4);
10676 break;
10677 default:
10678 assert(0);
10679 }
10680 break;
10681 }
10682 case PyUnicode_4BYTE_KIND:
10683 {
10684 switch(kind2) {
10685 case PyUnicode_1BYTE_KIND:
10686 COMPARE(Py_UCS4, Py_UCS1);
10687 break;
10688 case PyUnicode_2BYTE_KIND:
10689 COMPARE(Py_UCS4, Py_UCS2);
10690 break;
10691 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010692 {
10693#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10694 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10695 /* normalize result of wmemcmp() into the range [-1; 1] */
10696 if (cmp < 0)
10697 return -1;
10698 if (cmp > 0)
10699 return 1;
10700#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010701 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010702#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010703 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010704 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010705 default:
10706 assert(0);
10707 }
10708 break;
10709 }
10710 default:
10711 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010712 }
10713
Victor Stinner770e19e2012-10-04 22:59:45 +020010714 if (len1 == len2)
10715 return 0;
10716 if (len1 < len2)
10717 return -1;
10718 else
10719 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010720
10721#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010722}
10723
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010724Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010725unicode_compare_eq(PyObject *str1, PyObject *str2)
10726{
10727 int kind;
10728 void *data1, *data2;
10729 Py_ssize_t len;
10730 int cmp;
10731
Victor Stinnere5567ad2012-10-23 02:48:49 +020010732 len = PyUnicode_GET_LENGTH(str1);
10733 if (PyUnicode_GET_LENGTH(str2) != len)
10734 return 0;
10735 kind = PyUnicode_KIND(str1);
10736 if (PyUnicode_KIND(str2) != kind)
10737 return 0;
10738 data1 = PyUnicode_DATA(str1);
10739 data2 = PyUnicode_DATA(str2);
10740
10741 cmp = memcmp(data1, data2, len * kind);
10742 return (cmp == 0);
10743}
10744
10745
Alexander Belopolsky40018472011-02-26 01:02:56 +000010746int
10747PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10750 if (PyUnicode_READY(left) == -1 ||
10751 PyUnicode_READY(right) == -1)
10752 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010753
10754 /* a string is equal to itself */
10755 if (left == right)
10756 return 0;
10757
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010758 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010760 PyErr_Format(PyExc_TypeError,
10761 "Can't compare %.100s and %.100s",
10762 left->ob_type->tp_name,
10763 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764 return -1;
10765}
10766
Martin v. Löwis5b222132007-06-10 09:51:05 +000010767int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010768_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10769{
10770 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10771 if (right_str == NULL)
10772 return -1;
10773 return PyUnicode_Compare(left, right_str);
10774}
10775
10776int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010777PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 Py_ssize_t i;
10780 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 Py_UCS4 chr;
10782
Victor Stinner910337b2011-10-03 03:20:16 +020010783 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (PyUnicode_READY(uni) == -1)
10785 return -1;
10786 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010787 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010788 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010789 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010790 size_t len, len2 = strlen(str);
10791 int cmp;
10792
10793 len = Py_MIN(len1, len2);
10794 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010795 if (cmp != 0) {
10796 if (cmp < 0)
10797 return -1;
10798 else
10799 return 1;
10800 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010801 if (len1 > len2)
10802 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010803 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010804 return -1; /* str is longer */
10805 return 0;
10806 }
10807 else {
10808 void *data = PyUnicode_DATA(uni);
10809 /* Compare Unicode string and source character set string */
10810 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010811 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010812 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10813 /* This check keeps Python strings that end in '\0' from comparing equal
10814 to C strings identical up to that point. */
10815 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10816 return 1; /* uni is longer */
10817 if (str[i])
10818 return -1; /* str is longer */
10819 return 0;
10820 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010821}
10822
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010823
Benjamin Peterson29060642009-01-31 22:14:21 +000010824#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010825 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010826
Alexander Belopolsky40018472011-02-26 01:02:56 +000010827PyObject *
10828PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010829{
10830 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010831 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010832
Victor Stinnere5567ad2012-10-23 02:48:49 +020010833 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10834 Py_RETURN_NOTIMPLEMENTED;
10835
10836 if (PyUnicode_READY(left) == -1 ||
10837 PyUnicode_READY(right) == -1)
10838 return NULL;
10839
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010840 if (left == right) {
10841 switch (op) {
10842 case Py_EQ:
10843 case Py_LE:
10844 case Py_GE:
10845 /* a string is equal to itself */
10846 v = Py_True;
10847 break;
10848 case Py_NE:
10849 case Py_LT:
10850 case Py_GT:
10851 v = Py_False;
10852 break;
10853 default:
10854 PyErr_BadArgument();
10855 return NULL;
10856 }
10857 }
10858 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010859 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010860 result ^= (op == Py_NE);
10861 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010862 }
10863 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010864 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010865
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010866 /* Convert the return value to a Boolean */
10867 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010868 case Py_LE:
10869 v = TEST_COND(result <= 0);
10870 break;
10871 case Py_GE:
10872 v = TEST_COND(result >= 0);
10873 break;
10874 case Py_LT:
10875 v = TEST_COND(result == -1);
10876 break;
10877 case Py_GT:
10878 v = TEST_COND(result == 1);
10879 break;
10880 default:
10881 PyErr_BadArgument();
10882 return NULL;
10883 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010884 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010885 Py_INCREF(v);
10886 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010887}
10888
Alexander Belopolsky40018472011-02-26 01:02:56 +000010889int
10890PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010891{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010892 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010893 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 void *buf1, *buf2;
10895 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010896 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010897
10898 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010899 sub = PyUnicode_FromObject(element);
10900 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 PyErr_Format(PyExc_TypeError,
10902 "'in <string>' requires string as left operand, not %s",
10903 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010905 }
10906
Thomas Wouters477c8d52006-05-27 19:21:47 +000010907 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010908 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010909 Py_DECREF(sub);
10910 return -1;
10911 }
10912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 kind1 = PyUnicode_KIND(str);
10914 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010915 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010917 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010918 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 }
10920 len1 = PyUnicode_GET_LENGTH(str);
10921 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010922 if (len1 < len2) {
10923 Py_DECREF(sub);
10924 Py_DECREF(str);
10925 return 0;
10926 }
10927 buf1 = PyUnicode_DATA(str);
10928 buf2 = PyUnicode_DATA(sub);
10929 if (len2 == 1) {
10930 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10931 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10932 Py_DECREF(sub);
10933 Py_DECREF(str);
10934 return result;
10935 }
10936 if (kind2 != kind1) {
10937 buf2 = _PyUnicode_AsKind(sub, kind1);
10938 if (!buf2) {
10939 Py_DECREF(sub);
10940 Py_DECREF(str);
10941 return -1;
10942 }
10943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944
Victor Stinner77282cb2013-04-14 19:22:47 +020010945 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 case PyUnicode_1BYTE_KIND:
10947 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10948 break;
10949 case PyUnicode_2BYTE_KIND:
10950 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10951 break;
10952 case PyUnicode_4BYTE_KIND:
10953 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10954 break;
10955 default:
10956 result = -1;
10957 assert(0);
10958 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959
10960 Py_DECREF(str);
10961 Py_DECREF(sub);
10962
Victor Stinner77282cb2013-04-14 19:22:47 +020010963 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 PyMem_Free(buf2);
10965
Guido van Rossum403d68b2000-03-13 15:55:09 +000010966 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010967}
10968
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969/* Concat to string or Unicode object giving a new Unicode object. */
10970
Alexander Belopolsky40018472011-02-26 01:02:56 +000010971PyObject *
10972PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010975 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010976 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
10978 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985
10986 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010987 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010991 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 }
10995
Victor Stinner488fa492011-12-12 00:01:39 +010010996 u_len = PyUnicode_GET_LENGTH(u);
10997 v_len = PyUnicode_GET_LENGTH(v);
10998 if (u_len > PY_SSIZE_T_MAX - v_len) {
10999 PyErr_SetString(PyExc_OverflowError,
11000 "strings are too large to concat");
11001 goto onError;
11002 }
11003 new_len = u_len + v_len;
11004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011006 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011007 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011010 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011013 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11014 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 Py_DECREF(u);
11016 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011017 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 Py_XDECREF(u);
11022 Py_XDECREF(v);
11023 return NULL;
11024}
11025
Walter Dörwald1ab83302007-05-18 17:15:44 +000011026void
Victor Stinner23e56682011-10-03 03:54:37 +020011027PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011028{
Victor Stinner23e56682011-10-03 03:54:37 +020011029 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011030 Py_UCS4 maxchar, maxchar2;
11031 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011032
11033 if (p_left == NULL) {
11034 if (!PyErr_Occurred())
11035 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011036 return;
11037 }
Victor Stinner23e56682011-10-03 03:54:37 +020011038 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011039 if (right == NULL || left == NULL
11040 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011041 if (!PyErr_Occurred())
11042 PyErr_BadInternalCall();
11043 goto error;
11044 }
11045
Benjamin Petersonbac79492012-01-14 13:34:47 -050011046 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011047 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011048 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011049 goto error;
11050
Victor Stinner488fa492011-12-12 00:01:39 +010011051 /* Shortcuts */
11052 if (left == unicode_empty) {
11053 Py_DECREF(left);
11054 Py_INCREF(right);
11055 *p_left = right;
11056 return;
11057 }
11058 if (right == unicode_empty)
11059 return;
11060
11061 left_len = PyUnicode_GET_LENGTH(left);
11062 right_len = PyUnicode_GET_LENGTH(right);
11063 if (left_len > PY_SSIZE_T_MAX - right_len) {
11064 PyErr_SetString(PyExc_OverflowError,
11065 "strings are too large to concat");
11066 goto error;
11067 }
11068 new_len = left_len + right_len;
11069
11070 if (unicode_modifiable(left)
11071 && PyUnicode_CheckExact(right)
11072 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011073 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11074 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011075 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011076 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011077 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11078 {
11079 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011080 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011081 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011082
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011083 /* copy 'right' into the newly allocated area of 'left' */
11084 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011085 }
Victor Stinner488fa492011-12-12 00:01:39 +010011086 else {
11087 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11088 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011089 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011090
Victor Stinner488fa492011-12-12 00:01:39 +010011091 /* Concat the two Unicode strings */
11092 res = PyUnicode_New(new_len, maxchar);
11093 if (res == NULL)
11094 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011095 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11096 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011097 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011098 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011099 }
11100 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011101 return;
11102
11103error:
Victor Stinner488fa492011-12-12 00:01:39 +010011104 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011105}
11106
11107void
11108PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11109{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011110 PyUnicode_Append(pleft, right);
11111 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011112}
11113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011114PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011118string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011119interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
11121static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011122unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011124 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011125 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011126 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011128 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 void *buf1, *buf2;
11130 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131
Jesus Ceaac451502011-04-20 17:09:23 +020011132 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11133 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 kind1 = PyUnicode_KIND(self);
11137 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011138 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011139 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011140 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 len1 = PyUnicode_GET_LENGTH(self);
11143 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011145 if (end - start < len2) {
11146 Py_DECREF(substring);
11147 return PyLong_FromLong(0);
11148 }
11149 buf1 = PyUnicode_DATA(self);
11150 buf2 = PyUnicode_DATA(substring);
11151 if (kind2 != kind1) {
11152 buf2 = _PyUnicode_AsKind(substring, kind1);
11153 if (!buf2) {
11154 Py_DECREF(substring);
11155 return NULL;
11156 }
11157 }
11158 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 case PyUnicode_1BYTE_KIND:
11160 iresult = ucs1lib_count(
11161 ((Py_UCS1*)buf1) + start, end - start,
11162 buf2, len2, PY_SSIZE_T_MAX
11163 );
11164 break;
11165 case PyUnicode_2BYTE_KIND:
11166 iresult = ucs2lib_count(
11167 ((Py_UCS2*)buf1) + start, end - start,
11168 buf2, len2, PY_SSIZE_T_MAX
11169 );
11170 break;
11171 case PyUnicode_4BYTE_KIND:
11172 iresult = ucs4lib_count(
11173 ((Py_UCS4*)buf1) + start, end - start,
11174 buf2, len2, PY_SSIZE_T_MAX
11175 );
11176 break;
11177 default:
11178 assert(0); iresult = 0;
11179 }
11180
11181 result = PyLong_FromSsize_t(iresult);
11182
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011183 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
11186 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011187
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 return result;
11189}
11190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011192 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011194Encode S using the codec registered for encoding. Default encoding\n\
11195is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011196handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011197a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11198'xmlcharrefreplace' as well as any other name registered with\n\
11199codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200
11201static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011202unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011204 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 char *encoding = NULL;
11206 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011207
Benjamin Peterson308d6372009-09-18 21:42:35 +000011208 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11209 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011211 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011212}
11213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011214PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011215 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216\n\
11217Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011218If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219
11220static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011221unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011223 Py_ssize_t i, j, line_pos, src_len, incr;
11224 Py_UCS4 ch;
11225 PyObject *u;
11226 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011227 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011229 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011230 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
Ezio Melotti745d54d2013-11-16 19:10:57 +020011232 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11233 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Antoine Pitrou22425222011-10-04 19:10:51 +020011236 if (PyUnicode_READY(self) == -1)
11237 return NULL;
11238
Thomas Wouters7e474022000-07-16 12:04:32 +000011239 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011240 src_len = PyUnicode_GET_LENGTH(self);
11241 i = j = line_pos = 0;
11242 kind = PyUnicode_KIND(self);
11243 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011244 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011245 for (; i < src_len; i++) {
11246 ch = PyUnicode_READ(kind, src_data, i);
11247 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011248 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011250 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 goto overflow;
11253 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011255 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 goto overflow;
11260 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011262 if (ch == '\n' || ch == '\r')
11263 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011266 if (!found)
11267 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011268
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 if (!u)
11272 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011273 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 for (; i < src_len; i++) {
11278 ch = PyUnicode_READ(kind, src_data, i);
11279 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011281 incr = tabsize - (line_pos % tabsize);
11282 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011283 FILL(kind, dest_data, ' ', j, incr);
11284 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011286 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 line_pos++;
11289 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011290 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 if (ch == '\n' || ch == '\r')
11292 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 }
11295 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011296 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011297
Antoine Pitroue71d5742011-10-04 15:55:09 +020011298 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011299 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301}
11302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011303PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305\n\
11306Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011307such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308arguments start and end are interpreted as in slice notation.\n\
11309\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011310Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
11312static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011315 /* initialize variables to prevent gcc warning */
11316 PyObject *substring = NULL;
11317 Py_ssize_t start = 0;
11318 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011319 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
Jesus Ceaac451502011-04-20 17:09:23 +020011321 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11322 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324
Christian Heimesd47802e2013-06-29 21:33:36 +020011325 if (PyUnicode_READY(self) == -1) {
11326 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011328 }
11329 if (PyUnicode_READY(substring) == -1) {
11330 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333
Victor Stinner7931d9a2011-11-04 00:22:48 +010011334 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
11336 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (result == -2)
11339 return NULL;
11340
Christian Heimes217cfd12007-12-02 14:31:20 +000011341 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
11344static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011345unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011347 void *data;
11348 enum PyUnicode_Kind kind;
11349 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011350
11351 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11352 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011354 }
11355 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11356 PyErr_SetString(PyExc_IndexError, "string index out of range");
11357 return NULL;
11358 }
11359 kind = PyUnicode_KIND(self);
11360 data = PyUnicode_DATA(self);
11361 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011362 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363}
11364
Guido van Rossumc2504932007-09-18 19:42:40 +000011365/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011366 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011367static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011368unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369{
Guido van Rossumc2504932007-09-18 19:42:40 +000011370 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011371 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011372
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011373#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011374 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011375#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (_PyUnicode_HASH(self) != -1)
11377 return _PyUnicode_HASH(self);
11378 if (PyUnicode_READY(self) == -1)
11379 return -1;
11380 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011381 /*
11382 We make the hash of the empty string be 0, rather than using
11383 (prefix ^ suffix), since this slightly obfuscates the hash secret
11384 */
11385 if (len == 0) {
11386 _PyUnicode_HASH(self) = 0;
11387 return 0;
11388 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011389 x = _Py_HashBytes(PyUnicode_DATA(self),
11390 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011392 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393}
11394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011398Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399
11400static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011403 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011404 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011405 PyObject *substring = NULL;
11406 Py_ssize_t start = 0;
11407 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Jesus Ceaac451502011-04-20 17:09:23 +020011409 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11410 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
Christian Heimesd47a0452013-06-29 21:21:37 +020011413 if (PyUnicode_READY(self) == -1) {
11414 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011416 }
11417 if (PyUnicode_READY(substring) == -1) {
11418 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421
Victor Stinner7931d9a2011-11-04 00:22:48 +010011422 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
11424 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (result == -2)
11427 return NULL;
11428
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 if (result < 0) {
11430 PyErr_SetString(PyExc_ValueError, "substring not found");
11431 return NULL;
11432 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011433
Christian Heimes217cfd12007-12-02 14:31:20 +000011434 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011444unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t i, length;
11447 int kind;
11448 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 int cased;
11450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (PyUnicode_READY(self) == -1)
11452 return NULL;
11453 length = PyUnicode_GET_LENGTH(self);
11454 kind = PyUnicode_KIND(self);
11455 data = PyUnicode_DATA(self);
11456
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 1)
11459 return PyBool_FromLong(
11460 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011462 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011465
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 for (i = 0; i < length; i++) {
11468 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011469
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11471 return PyBool_FromLong(0);
11472 else if (!cased && Py_UNICODE_ISLOWER(ch))
11473 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011475 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476}
11477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011481Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
11484static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011485unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 Py_ssize_t i, length;
11488 int kind;
11489 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 int cased;
11491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (PyUnicode_READY(self) == -1)
11493 return NULL;
11494 length = PyUnicode_GET_LENGTH(self);
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
11497
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (length == 1)
11500 return PyBool_FromLong(
11501 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011503 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011506
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 for (i = 0; i < length; i++) {
11509 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011510
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11512 return PyBool_FromLong(0);
11513 else if (!cased && Py_UNICODE_ISUPPER(ch))
11514 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011516 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517}
11518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011522Return True if S is a titlecased string and there is at least one\n\
11523character in S, i.e. upper- and titlecase characters may only\n\
11524follow uncased characters and lowercase characters only cased ones.\n\
11525Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
11527static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011528unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 Py_ssize_t i, length;
11531 int kind;
11532 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 int cased, previous_is_cased;
11534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (PyUnicode_READY(self) == -1)
11536 return NULL;
11537 length = PyUnicode_GET_LENGTH(self);
11538 kind = PyUnicode_KIND(self);
11539 data = PyUnicode_DATA(self);
11540
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (length == 1) {
11543 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11544 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11545 (Py_UNICODE_ISUPPER(ch) != 0));
11546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011548 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011551
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 cased = 0;
11553 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 for (i = 0; i < length; i++) {
11555 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011556
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11558 if (previous_is_cased)
11559 return PyBool_FromLong(0);
11560 previous_is_cased = 1;
11561 cased = 1;
11562 }
11563 else if (Py_UNICODE_ISLOWER(ch)) {
11564 if (!previous_is_cased)
11565 return PyBool_FromLong(0);
11566 previous_is_cased = 1;
11567 cased = 1;
11568 }
11569 else
11570 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011572 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573}
11574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011575PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011578Return True if all characters in S are whitespace\n\
11579and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
11581static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011582unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 Py_ssize_t i, length;
11585 int kind;
11586 void *data;
11587
11588 if (PyUnicode_READY(self) == -1)
11589 return NULL;
11590 length = PyUnicode_GET_LENGTH(self);
11591 kind = PyUnicode_KIND(self);
11592 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if (length == 1)
11596 return PyBool_FromLong(
11597 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011599 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 for (i = 0; i < length; i++) {
11604 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011605 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011608 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609}
11610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011613\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011614Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011616
11617static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011618unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 Py_ssize_t i, length;
11621 int kind;
11622 void *data;
11623
11624 if (PyUnicode_READY(self) == -1)
11625 return NULL;
11626 length = PyUnicode_GET_LENGTH(self);
11627 kind = PyUnicode_KIND(self);
11628 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011629
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011630 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (length == 1)
11632 return PyBool_FromLong(
11633 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634
11635 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 for (i = 0; i < length; i++) {
11640 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011643 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644}
11645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011646PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011648\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011649Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011651
11652static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011653unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 int kind;
11656 void *data;
11657 Py_ssize_t len, i;
11658
11659 if (PyUnicode_READY(self) == -1)
11660 return NULL;
11661
11662 kind = PyUnicode_KIND(self);
11663 data = PyUnicode_DATA(self);
11664 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011665
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011666 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (len == 1) {
11668 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11669 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11670 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011671
11672 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 for (i = 0; i < len; i++) {
11677 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011678 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011680 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011681 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011682}
11683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011684PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011687Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011688False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
11690static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011691unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 Py_ssize_t i, length;
11694 int kind;
11695 void *data;
11696
11697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699 length = PyUnicode_GET_LENGTH(self);
11700 kind = PyUnicode_KIND(self);
11701 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (length == 1)
11705 return PyBool_FromLong(
11706 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011708 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 for (i = 0; i < length; i++) {
11713 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011716 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717}
11718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011719PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011722Return True if all characters in S are digits\n\
11723and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
11725static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011726unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 Py_ssize_t i, length;
11729 int kind;
11730 void *data;
11731
11732 if (PyUnicode_READY(self) == -1)
11733 return NULL;
11734 length = PyUnicode_GET_LENGTH(self);
11735 kind = PyUnicode_KIND(self);
11736 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (length == 1) {
11740 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11741 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011744 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 for (i = 0; i < length; i++) {
11749 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011752 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753}
11754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011755PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011758Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011759False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760
11761static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011762unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 Py_ssize_t i, length;
11765 int kind;
11766 void *data;
11767
11768 if (PyUnicode_READY(self) == -1)
11769 return NULL;
11770 length = PyUnicode_GET_LENGTH(self);
11771 kind = PyUnicode_KIND(self);
11772 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (length == 1)
11776 return PyBool_FromLong(
11777 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011779 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 for (i = 0; i < length; i++) {
11784 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011787 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788}
11789
Martin v. Löwis47383402007-08-15 07:32:56 +000011790int
11791PyUnicode_IsIdentifier(PyObject *self)
11792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 int kind;
11794 void *data;
11795 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011796 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 if (PyUnicode_READY(self) == -1) {
11799 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 }
11802
11803 /* Special case for empty strings */
11804 if (PyUnicode_GET_LENGTH(self) == 0)
11805 return 0;
11806 kind = PyUnicode_KIND(self);
11807 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011808
11809 /* PEP 3131 says that the first character must be in
11810 XID_Start and subsequent characters in XID_Continue,
11811 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011812 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011813 letters, digits, underscore). However, given the current
11814 definition of XID_Start and XID_Continue, it is sufficient
11815 to check just for these, except that _ must be allowed
11816 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011818 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011819 return 0;
11820
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011821 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011824 return 1;
11825}
11826
11827PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011829\n\
11830Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011831to the language definition.\n\
11832\n\
11833Use keyword.iskeyword() to test for reserved identifiers\n\
11834such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011835
11836static PyObject*
11837unicode_isidentifier(PyObject *self)
11838{
11839 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11840}
11841
Georg Brandl559e5d72008-06-11 18:37:52 +000011842PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011844\n\
11845Return True if all characters in S are considered\n\
11846printable in repr() or S is empty, False otherwise.");
11847
11848static PyObject*
11849unicode_isprintable(PyObject *self)
11850{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 Py_ssize_t i, length;
11852 int kind;
11853 void *data;
11854
11855 if (PyUnicode_READY(self) == -1)
11856 return NULL;
11857 length = PyUnicode_GET_LENGTH(self);
11858 kind = PyUnicode_KIND(self);
11859 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011860
11861 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (length == 1)
11863 return PyBool_FromLong(
11864 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 for (i = 0; i < length; i++) {
11867 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011868 Py_RETURN_FALSE;
11869 }
11870 }
11871 Py_RETURN_TRUE;
11872}
11873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011874PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011875 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876\n\
11877Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011878iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
11880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011881unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011883 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884}
11885
Martin v. Löwis18e16552006-02-15 17:27:45 +000011886static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011887unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (PyUnicode_READY(self) == -1)
11890 return -1;
11891 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892}
11893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011894PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011897Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011898done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899
11900static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011901unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011903 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 Py_UCS4 fillchar = ' ';
11905
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011906 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 return NULL;
11908
Benjamin Petersonbac79492012-01-14 13:34:47 -050011909 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Victor Stinnerc4b49542011-12-11 22:44:26 +010011912 if (PyUnicode_GET_LENGTH(self) >= width)
11913 return unicode_result_unchanged(self);
11914
11915 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916}
11917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011918PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011921Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
11923static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011924unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011926 if (PyUnicode_READY(self) == -1)
11927 return NULL;
11928 if (PyUnicode_IS_ASCII(self))
11929 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011930 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931}
11932
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above.
   The table is read-only, so both the strings and the array itself
   are declared const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a selector: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11941
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011942/* externally visible for str.strip(unicode) */
11943PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011944_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 void *data;
11947 int kind;
11948 Py_ssize_t i, j, len;
11949 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011950 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11953 return NULL;
11954
11955 kind = PyUnicode_KIND(self);
11956 data = PyUnicode_DATA(self);
11957 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011958 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11960 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011961 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011962
Benjamin Peterson14339b62009-01-31 16:36:08 +000011963 i = 0;
11964 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011965 while (i < len) {
11966 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11967 if (!BLOOM(sepmask, ch))
11968 break;
11969 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11970 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 i++;
11972 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011974
Benjamin Peterson14339b62009-01-31 16:36:08 +000011975 j = len;
11976 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011977 j--;
11978 while (j >= i) {
11979 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11980 if (!BLOOM(sepmask, ch))
11981 break;
11982 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11983 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011985 }
11986
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011988 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011989
Victor Stinner7931d9a2011-11-04 00:22:48 +010011990 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991}
11992
11993PyObject*
11994PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11995{
11996 unsigned char *data;
11997 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011998 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999
Victor Stinnerde636f32011-10-01 03:55:54 +020012000 if (PyUnicode_READY(self) == -1)
12001 return NULL;
12002
Victor Stinner684d5fd2012-05-03 02:32:34 +020012003 length = PyUnicode_GET_LENGTH(self);
12004 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012005
Victor Stinner684d5fd2012-05-03 02:32:34 +020012006 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012007 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008
Victor Stinnerde636f32011-10-01 03:55:54 +020012009 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012010 PyErr_SetString(PyExc_IndexError, "string index out of range");
12011 return NULL;
12012 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012013 if (start >= length || end < start)
12014 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012015
Victor Stinner684d5fd2012-05-03 02:32:34 +020012016 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012017 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012018 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012019 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012020 }
12021 else {
12022 kind = PyUnicode_KIND(self);
12023 data = PyUnicode_1BYTE_DATA(self);
12024 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012025 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012026 length);
12027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
12030static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012031do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 Py_ssize_t len, i, j;
12034
12035 if (PyUnicode_READY(self) == -1)
12036 return NULL;
12037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012039
Victor Stinnercc7af722013-04-09 22:39:24 +020012040 if (PyUnicode_IS_ASCII(self)) {
12041 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12042
12043 i = 0;
12044 if (striptype != RIGHTSTRIP) {
12045 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012046 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012047 if (!_Py_ascii_whitespace[ch])
12048 break;
12049 i++;
12050 }
12051 }
12052
12053 j = len;
12054 if (striptype != LEFTSTRIP) {
12055 j--;
12056 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012057 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012058 if (!_Py_ascii_whitespace[ch])
12059 break;
12060 j--;
12061 }
12062 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012063 }
12064 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012065 else {
12066 int kind = PyUnicode_KIND(self);
12067 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012068
Victor Stinnercc7af722013-04-09 22:39:24 +020012069 i = 0;
12070 if (striptype != RIGHTSTRIP) {
12071 while (i < len) {
12072 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12073 if (!Py_UNICODE_ISSPACE(ch))
12074 break;
12075 i++;
12076 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012077 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012078
12079 j = len;
12080 if (striptype != LEFTSTRIP) {
12081 j--;
12082 while (j >= i) {
12083 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12084 if (!Py_UNICODE_ISSPACE(ch))
12085 break;
12086 j--;
12087 }
12088 j++;
12089 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012091
Victor Stinner7931d9a2011-11-04 00:22:48 +010012092 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093}
12094
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012095
12096static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012097do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012098{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012099 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012100
Serhiy Storchakac6792272013-10-19 21:03:34 +030012101 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 if (sep != NULL && sep != Py_None) {
12105 if (PyUnicode_Check(sep))
12106 return _PyUnicode_XStrip(self, striptype, sep);
12107 else {
12108 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "%s arg must be None or str",
12110 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 return NULL;
12112 }
12113 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114
Benjamin Peterson14339b62009-01-31 16:36:08 +000012115 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116}
12117
12118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121\n\
12122Return a copy of the string S with leading and trailing\n\
12123whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012124If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012125
12126static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012127unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012129 if (PyTuple_GET_SIZE(args) == 0)
12130 return do_strip(self, BOTHSTRIP); /* Common case */
12131 else
12132 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133}
12134
12135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012136PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138\n\
12139Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012140If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012141
12142static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012143unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012145 if (PyTuple_GET_SIZE(args) == 0)
12146 return do_strip(self, LEFTSTRIP); /* Common case */
12147 else
12148 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149}
12150
12151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154\n\
12155Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012156If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012157
12158static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012159unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 if (PyTuple_GET_SIZE(args) == 0)
12162 return do_strip(self, RIGHTSTRIP); /* Common case */
12163 else
12164 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012165}
12166
12167
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012169unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012171 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173
Serhiy Storchaka05997252013-01-26 12:14:02 +020012174 if (len < 1)
12175 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
Victor Stinnerc4b49542011-12-11 22:44:26 +010012177 /* no repeat, return original string */
12178 if (len == 1)
12179 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012180
Benjamin Petersonbac79492012-01-14 13:34:47 -050012181 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 return NULL;
12183
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012184 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012185 PyErr_SetString(PyExc_OverflowError,
12186 "repeated string is too long");
12187 return NULL;
12188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012190
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012191 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192 if (!u)
12193 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012194 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (PyUnicode_GET_LENGTH(str) == 1) {
12197 const int kind = PyUnicode_KIND(str);
12198 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012199 if (kind == PyUnicode_1BYTE_KIND) {
12200 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012201 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012202 }
12203 else if (kind == PyUnicode_2BYTE_KIND) {
12204 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012205 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012206 ucs2[n] = fill_char;
12207 } else {
12208 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12209 assert(kind == PyUnicode_4BYTE_KIND);
12210 for (n = 0; n < len; ++n)
12211 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 }
12214 else {
12215 /* number of characters copied this far */
12216 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012217 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 char *to = (char *) PyUnicode_DATA(u);
12219 Py_MEMCPY(to, PyUnicode_DATA(str),
12220 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 n = (done <= nchars-done) ? done : nchars-done;
12223 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012224 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226 }
12227
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012228 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012229 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230}
12231
Alexander Belopolsky40018472011-02-26 01:02:56 +000012232PyObject *
12233PyUnicode_Replace(PyObject *obj,
12234 PyObject *subobj,
12235 PyObject *replobj,
12236 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
12238 PyObject *self;
12239 PyObject *str1;
12240 PyObject *str2;
12241 PyObject *result;
12242
12243 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012244 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012247 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 Py_DECREF(self);
12249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 }
12251 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012252 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 Py_DECREF(self);
12254 Py_DECREF(str1);
12255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012257 if (PyUnicode_READY(self) == -1 ||
12258 PyUnicode_READY(str1) == -1 ||
12259 PyUnicode_READY(str2) == -1)
12260 result = NULL;
12261 else
12262 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 Py_DECREF(self);
12264 Py_DECREF(str1);
12265 Py_DECREF(str2);
12266 return result;
12267}
12268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012269PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012270 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271\n\
12272Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012273old replaced by new. If the optional argument count is\n\
12274given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275
12276static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 PyObject *str1;
12280 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012281 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 PyObject *result;
12283
Martin v. Löwis18e16552006-02-15 17:27:45 +000012284 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012286 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012289 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 return NULL;
12291 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012292 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 Py_DECREF(str1);
12294 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012295 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012296 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12297 result = NULL;
12298 else
12299 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
12301 Py_DECREF(str1);
12302 Py_DECREF(str2);
12303 return result;
12304}
12305
Alexander Belopolsky40018472011-02-26 01:02:56 +000012306static PyObject *
12307unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012309 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 Py_ssize_t isize;
12311 Py_ssize_t osize, squote, dquote, i, o;
12312 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012313 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012317 return NULL;
12318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 isize = PyUnicode_GET_LENGTH(unicode);
12320 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 /* Compute length of output, quote characters, and
12323 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012324 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 max = 127;
12326 squote = dquote = 0;
12327 ikind = PyUnicode_KIND(unicode);
12328 for (i = 0; i < isize; i++) {
12329 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012330 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012332 case '\'': squote++; break;
12333 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012335 incr = 2;
12336 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 default:
12338 /* Fast-path ASCII */
12339 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012340 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012342 ;
12343 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012346 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012348 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012350 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 if (osize > PY_SSIZE_T_MAX - incr) {
12353 PyErr_SetString(PyExc_OverflowError,
12354 "string is too long to generate repr");
12355 return NULL;
12356 }
12357 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 }
12359
12360 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012361 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012363 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 if (dquote)
12365 /* Both squote and dquote present. Use squote,
12366 and escape them */
12367 osize += squote;
12368 else
12369 quote = '"';
12370 }
Victor Stinner55c08782013-04-14 18:45:39 +020012371 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372
12373 repr = PyUnicode_New(osize, max);
12374 if (repr == NULL)
12375 return NULL;
12376 okind = PyUnicode_KIND(repr);
12377 odata = PyUnicode_DATA(repr);
12378
12379 PyUnicode_WRITE(okind, odata, 0, quote);
12380 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012381 if (unchanged) {
12382 _PyUnicode_FastCopyCharacters(repr, 1,
12383 unicode, 0,
12384 isize);
12385 }
12386 else {
12387 for (i = 0, o = 1; i < isize; i++) {
12388 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389
Victor Stinner55c08782013-04-14 18:45:39 +020012390 /* Escape quotes and backslashes */
12391 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012392 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012394 continue;
12395 }
12396
12397 /* Map special whitespace to '\t', \n', '\r' */
12398 if (ch == '\t') {
12399 PyUnicode_WRITE(okind, odata, o++, '\\');
12400 PyUnicode_WRITE(okind, odata, o++, 't');
12401 }
12402 else if (ch == '\n') {
12403 PyUnicode_WRITE(okind, odata, o++, '\\');
12404 PyUnicode_WRITE(okind, odata, o++, 'n');
12405 }
12406 else if (ch == '\r') {
12407 PyUnicode_WRITE(okind, odata, o++, '\\');
12408 PyUnicode_WRITE(okind, odata, o++, 'r');
12409 }
12410
12411 /* Map non-printable US ASCII to '\xhh' */
12412 else if (ch < ' ' || ch == 0x7F) {
12413 PyUnicode_WRITE(okind, odata, o++, '\\');
12414 PyUnicode_WRITE(okind, odata, o++, 'x');
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12416 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12417 }
12418
12419 /* Copy ASCII characters as-is */
12420 else if (ch < 0x7F) {
12421 PyUnicode_WRITE(okind, odata, o++, ch);
12422 }
12423
12424 /* Non-ASCII characters */
12425 else {
12426 /* Map Unicode whitespace and control characters
12427 (categories Z* and C* except ASCII space)
12428 */
12429 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12430 PyUnicode_WRITE(okind, odata, o++, '\\');
12431 /* Map 8-bit characters to '\xhh' */
12432 if (ch <= 0xff) {
12433 PyUnicode_WRITE(okind, odata, o++, 'x');
12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12436 }
12437 /* Map 16-bit characters to '\uxxxx' */
12438 else if (ch <= 0xffff) {
12439 PyUnicode_WRITE(okind, odata, o++, 'u');
12440 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12443 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12444 }
12445 /* Map 21-bit characters to '\U00xxxxxx' */
12446 else {
12447 PyUnicode_WRITE(okind, odata, o++, 'U');
12448 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12449 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12456 }
12457 }
12458 /* Copy characters as-is */
12459 else {
12460 PyUnicode_WRITE(okind, odata, o++, ch);
12461 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012462 }
12463 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012466 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012467 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468}
12469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012470PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472\n\
12473Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012474such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475arguments start and end are interpreted as in slice notation.\n\
12476\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012477Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478
12479static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012482 /* initialize variables to prevent gcc warning */
12483 PyObject *substring = NULL;
12484 Py_ssize_t start = 0;
12485 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012486 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487
Jesus Ceaac451502011-04-20 17:09:23 +020012488 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12489 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
Christian Heimesea71a522013-06-29 21:17:34 +020012492 if (PyUnicode_READY(self) == -1) {
12493 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012495 }
12496 if (PyUnicode_READY(substring) == -1) {
12497 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500
Victor Stinner7931d9a2011-11-04 00:22:48 +010012501 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502
12503 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 if (result == -2)
12506 return NULL;
12507
Christian Heimes217cfd12007-12-02 14:31:20 +000012508 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012514Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012519 /* initialize variables to prevent gcc warning */
12520 PyObject *substring = NULL;
12521 Py_ssize_t start = 0;
12522 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012523 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
Jesus Ceaac451502011-04-20 17:09:23 +020012525 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12526 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Christian Heimesea71a522013-06-29 21:17:34 +020012529 if (PyUnicode_READY(self) == -1) {
12530 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012532 }
12533 if (PyUnicode_READY(substring) == -1) {
12534 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537
Victor Stinner7931d9a2011-11-04 00:22:48 +010012538 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
12540 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 if (result == -2)
12543 return NULL;
12544
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545 if (result < 0) {
12546 PyErr_SetString(PyExc_ValueError, "substring not found");
12547 return NULL;
12548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549
Christian Heimes217cfd12007-12-02 14:31:20 +000012550 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551}
12552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012553PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012556Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012557done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
12559static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012560unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012562 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 Py_UCS4 fillchar = ' ';
12564
Victor Stinnere9a29352011-10-01 02:14:59 +020012565 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012567
Benjamin Petersonbac79492012-01-14 13:34:47 -050012568 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569 return NULL;
12570
Victor Stinnerc4b49542011-12-11 22:44:26 +010012571 if (PyUnicode_GET_LENGTH(self) >= width)
12572 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
Victor Stinnerc4b49542011-12-11 22:44:26 +010012574 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575}
12576
Alexander Belopolsky40018472011-02-26 01:02:56 +000012577PyObject *
12578PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
12580 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012581
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582 s = PyUnicode_FromObject(s);
12583 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012584 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 if (sep != NULL) {
12586 sep = PyUnicode_FromObject(sep);
12587 if (sep == NULL) {
12588 Py_DECREF(s);
12589 return NULL;
12590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591 }
12592
Victor Stinner9310abb2011-10-05 00:59:23 +020012593 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594
12595 Py_DECREF(s);
12596 Py_XDECREF(sep);
12597 return result;
12598}
12599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012600PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012601 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602\n\
12603Return a list of the words in S, using sep as the\n\
12604delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012605splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012606whitespace string is a separator and empty strings are\n\
12607removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608
12609static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012610unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012612 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012614 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012616 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12617 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618 return NULL;
12619
12620 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012623 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012625 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626}
12627
Thomas Wouters477c8d52006-05-27 19:21:47 +000012628PyObject *
12629PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12630{
12631 PyObject* str_obj;
12632 PyObject* sep_obj;
12633 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012634 int kind1, kind2;
12635 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012637
12638 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012639 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012641 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012642 if (!sep_obj) {
12643 Py_DECREF(str_obj);
12644 return NULL;
12645 }
12646 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12647 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012648 Py_DECREF(str_obj);
12649 return NULL;
12650 }
12651
Victor Stinner14f8f022011-10-05 20:58:25 +020012652 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 len1 = PyUnicode_GET_LENGTH(str_obj);
12655 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012656 if (kind1 < kind2 || len1 < len2) {
12657 _Py_INCREF_UNICODE_EMPTY();
12658 if (!unicode_empty)
12659 out = NULL;
12660 else {
12661 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12662 Py_DECREF(unicode_empty);
12663 }
12664 Py_DECREF(sep_obj);
12665 Py_DECREF(str_obj);
12666 return out;
12667 }
12668 buf1 = PyUnicode_DATA(str_obj);
12669 buf2 = PyUnicode_DATA(sep_obj);
12670 if (kind2 != kind1) {
12671 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12672 if (!buf2)
12673 goto onError;
12674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012676 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012678 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12679 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12680 else
12681 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 break;
12683 case PyUnicode_2BYTE_KIND:
12684 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12685 break;
12686 case PyUnicode_4BYTE_KIND:
12687 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12688 break;
12689 default:
12690 assert(0);
12691 out = 0;
12692 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012693
12694 Py_DECREF(sep_obj);
12695 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012696 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012698
12699 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 onError:
12701 Py_DECREF(sep_obj);
12702 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012703 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 PyMem_Free(buf2);
12705 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012706}
12707
12708
12709PyObject *
12710PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12711{
12712 PyObject* str_obj;
12713 PyObject* sep_obj;
12714 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012715 int kind1, kind2;
12716 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012718
12719 str_obj = PyUnicode_FromObject(str_in);
12720 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012722 sep_obj = PyUnicode_FromObject(sep_in);
12723 if (!sep_obj) {
12724 Py_DECREF(str_obj);
12725 return NULL;
12726 }
12727
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012728 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 len1 = PyUnicode_GET_LENGTH(str_obj);
12731 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012732 if (kind1 < kind2 || len1 < len2) {
12733 _Py_INCREF_UNICODE_EMPTY();
12734 if (!unicode_empty)
12735 out = NULL;
12736 else {
12737 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12738 Py_DECREF(unicode_empty);
12739 }
12740 Py_DECREF(sep_obj);
12741 Py_DECREF(str_obj);
12742 return out;
12743 }
12744 buf1 = PyUnicode_DATA(str_obj);
12745 buf2 = PyUnicode_DATA(sep_obj);
12746 if (kind2 != kind1) {
12747 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12748 if (!buf2)
12749 goto onError;
12750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012752 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012754 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12755 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12756 else
12757 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 break;
12759 case PyUnicode_2BYTE_KIND:
12760 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12761 break;
12762 case PyUnicode_4BYTE_KIND:
12763 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12764 break;
12765 default:
12766 assert(0);
12767 out = 0;
12768 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012769
12770 Py_DECREF(sep_obj);
12771 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012772 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012774
12775 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 onError:
12777 Py_DECREF(sep_obj);
12778 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012779 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 PyMem_Free(buf2);
12781 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782}
12783
12784PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012787Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012789found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790
12791static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012792unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793{
Victor Stinner9310abb2011-10-05 00:59:23 +020012794 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795}
12796
12797PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012798 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012800Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012802separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803
12804static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012805unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806{
Victor Stinner9310abb2011-10-05 00:59:23 +020012807 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808}
12809
Alexander Belopolsky40018472011-02-26 01:02:56 +000012810PyObject *
12811PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012812{
12813 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012814
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012815 s = PyUnicode_FromObject(s);
12816 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 if (sep != NULL) {
12819 sep = PyUnicode_FromObject(sep);
12820 if (sep == NULL) {
12821 Py_DECREF(s);
12822 return NULL;
12823 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012824 }
12825
Victor Stinner9310abb2011-10-05 00:59:23 +020012826 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012827
12828 Py_DECREF(s);
12829 Py_XDECREF(sep);
12830 return result;
12831}
12832
12833PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835\n\
12836Return a list of the words in S, using sep as the\n\
12837delimiter string, starting at the end of the string and\n\
12838working to the front. If maxsplit is given, at most maxsplit\n\
12839splits are done. If sep is not specified, any whitespace string\n\
12840is a separator.");
12841
12842static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012843unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012845 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012847 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012849 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12850 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012851 return NULL;
12852
12853 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012856 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012858 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859}
12860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012861PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863\n\
12864Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012865Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012866is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867
12868static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012869unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012871 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012872 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012874 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12875 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876 return NULL;
12877
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012878 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879}
12880
12881static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012882PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012884 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012887PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889\n\
12890Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012891and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
12893static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012894unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012896 if (PyUnicode_READY(self) == -1)
12897 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012898 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899}
12900
Larry Hastings61272b72014-01-07 12:41:53 -080012901/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012902
Larry Hastings31826802013-10-19 00:09:25 -070012903@staticmethod
12904str.maketrans as unicode_maketrans
12905
12906 x: object
12907
12908 y: unicode=NULL
12909
12910 z: unicode=NULL
12911
12912 /
12913
12914Return a translation table usable for str.translate().
12915
12916If there is only one argument, it must be a dictionary mapping Unicode
12917ordinals (integers) or characters to Unicode ordinals, strings or None.
12918Character keys will be then converted to ordinals.
12919If there are two arguments, they must be strings of equal length, and
12920in the resulting dictionary, each character in x will be mapped to the
12921character at the same position in y. If there is a third argument, it
12922must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012923[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012924
Larry Hastings31826802013-10-19 00:09:25 -070012925static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012926unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012927/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012928{
Georg Brandlceee0772007-11-27 23:48:05 +000012929 PyObject *new = NULL, *key, *value;
12930 Py_ssize_t i = 0;
12931 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012932
Georg Brandlceee0772007-11-27 23:48:05 +000012933 new = PyDict_New();
12934 if (!new)
12935 return NULL;
12936 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 int x_kind, y_kind, z_kind;
12938 void *x_data, *y_data, *z_data;
12939
Georg Brandlceee0772007-11-27 23:48:05 +000012940 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012941 if (!PyUnicode_Check(x)) {
12942 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12943 "be a string if there is a second argument");
12944 goto err;
12945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012947 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12948 "arguments must have equal length");
12949 goto err;
12950 }
12951 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 x_kind = PyUnicode_KIND(x);
12953 y_kind = PyUnicode_KIND(y);
12954 x_data = PyUnicode_DATA(x);
12955 y_data = PyUnicode_DATA(y);
12956 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12957 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012958 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012959 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012960 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012961 if (!value) {
12962 Py_DECREF(key);
12963 goto err;
12964 }
Georg Brandlceee0772007-11-27 23:48:05 +000012965 res = PyDict_SetItem(new, key, value);
12966 Py_DECREF(key);
12967 Py_DECREF(value);
12968 if (res < 0)
12969 goto err;
12970 }
12971 /* create entries for deleting chars in z */
12972 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 z_kind = PyUnicode_KIND(z);
12974 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012975 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012977 if (!key)
12978 goto err;
12979 res = PyDict_SetItem(new, key, Py_None);
12980 Py_DECREF(key);
12981 if (res < 0)
12982 goto err;
12983 }
12984 }
12985 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 int kind;
12987 void *data;
12988
Georg Brandlceee0772007-11-27 23:48:05 +000012989 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012990 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012991 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12992 "to maketrans it must be a dict");
12993 goto err;
12994 }
12995 /* copy entries into the new dict, converting string keys to int keys */
12996 while (PyDict_Next(x, &i, &key, &value)) {
12997 if (PyUnicode_Check(key)) {
12998 /* convert string keys to integer keys */
12999 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013000 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013001 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13002 "table must be of length 1");
13003 goto err;
13004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 kind = PyUnicode_KIND(key);
13006 data = PyUnicode_DATA(key);
13007 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013008 if (!newkey)
13009 goto err;
13010 res = PyDict_SetItem(new, newkey, value);
13011 Py_DECREF(newkey);
13012 if (res < 0)
13013 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013014 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013015 /* just keep integer keys */
13016 if (PyDict_SetItem(new, key, value) < 0)
13017 goto err;
13018 } else {
13019 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13020 "be strings or integers");
13021 goto err;
13022 }
13023 }
13024 }
13025 return new;
13026 err:
13027 Py_DECREF(new);
13028 return NULL;
13029}
13030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013031PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013032 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013034Return a copy of the string S in which each character has been mapped\n\
13035through the given translation table. The table must implement\n\
13036lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13037mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13038this operation raises LookupError, the character is left untouched.\n\
13039Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040
13041static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045}
13046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013047PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013050Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051
13052static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013053unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013055 if (PyUnicode_READY(self) == -1)
13056 return NULL;
13057 if (PyUnicode_IS_ASCII(self))
13058 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013059 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060}
13061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013062PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013063 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013065Pad a numeric string S with zeros on the left, to fill a field\n\
13066of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067
13068static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013069unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013071 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013072 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013073 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 int kind;
13075 void *data;
13076 Py_UCS4 chr;
13077
Martin v. Löwis18e16552006-02-15 17:27:45 +000013078 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079 return NULL;
13080
Benjamin Petersonbac79492012-01-14 13:34:47 -050013081 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083
Victor Stinnerc4b49542011-12-11 22:44:26 +010013084 if (PyUnicode_GET_LENGTH(self) >= width)
13085 return unicode_result_unchanged(self);
13086
13087 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088
13089 u = pad(self, fill, 0, '0');
13090
Walter Dörwald068325e2002-04-15 13:36:47 +000013091 if (u == NULL)
13092 return NULL;
13093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 kind = PyUnicode_KIND(u);
13095 data = PyUnicode_DATA(u);
13096 chr = PyUnicode_READ(kind, data, fill);
13097
13098 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 PyUnicode_WRITE(kind, data, 0, chr);
13101 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102 }
13103
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013104 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013105 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
#if 0
/* Compiled out by the surrounding #if 0.
   Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013116PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013119Return True if S starts with the specified prefix, False otherwise.\n\
13120With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013121With optional end, stop comparing S at that position.\n\
13122prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123
13124static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013125unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013128 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013129 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013130 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013131 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013132 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
Jesus Ceaac451502011-04-20 17:09:23 +020013134 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013136 if (PyTuple_Check(subobj)) {
13137 Py_ssize_t i;
13138 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013139 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013140 if (substring == NULL)
13141 return NULL;
13142 result = tailmatch(self, substring, start, end, -1);
13143 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013144 if (result == -1)
13145 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013146 if (result) {
13147 Py_RETURN_TRUE;
13148 }
13149 }
13150 /* nothing matched */
13151 Py_RETURN_FALSE;
13152 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013153 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013154 if (substring == NULL) {
13155 if (PyErr_ExceptionMatches(PyExc_TypeError))
13156 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13157 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013159 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013160 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013162 if (result == -1)
13163 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165}
13166
13167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013168PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013171Return True if S ends with the specified suffix, False otherwise.\n\
13172With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013173With optional end, stop comparing S at that position.\n\
13174suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175
13176static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013177unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013180 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013182 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013183 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013184 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185
Jesus Ceaac451502011-04-20 17:09:23 +020013186 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188 if (PyTuple_Check(subobj)) {
13189 Py_ssize_t i;
13190 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013191 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013193 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013195 result = tailmatch(self, substring, start, end, +1);
13196 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013197 if (result == -1)
13198 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 if (result) {
13200 Py_RETURN_TRUE;
13201 }
13202 }
13203 Py_RETURN_FALSE;
13204 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013205 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013206 if (substring == NULL) {
13207 if (PyErr_ExceptionMatches(PyExc_TypeError))
13208 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13209 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013211 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013213 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013214 if (result == -1)
13215 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013216 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217}
13218
Victor Stinner202fdca2012-05-07 12:47:02 +020013219Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013220_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013221{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013222 if (!writer->readonly)
13223 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13224 else {
13225 /* Copy-on-write mode: set buffer size to 0 so
13226 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13227 * next write. */
13228 writer->size = 0;
13229 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013230 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13231 writer->data = PyUnicode_DATA(writer->buffer);
13232 writer->kind = PyUnicode_KIND(writer->buffer);
13233}
13234
Victor Stinnerd3f08822012-05-29 12:57:52 +020013235void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013236_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013237{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013238 memset(writer, 0, sizeof(*writer));
13239#ifdef Py_DEBUG
13240 writer->kind = 5; /* invalid kind */
13241#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013242 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013243}
13244
Victor Stinnerd3f08822012-05-29 12:57:52 +020013245int
13246_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13247 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013248{
Victor Stinner6989ba02013-11-18 21:08:39 +010013249#ifdef MS_WINDOWS
13250 /* On Windows, overallocate by 50% is the best factor */
13251# define OVERALLOCATE_FACTOR 2
13252#else
13253 /* On Linux, overallocate by 25% is the best factor */
13254# define OVERALLOCATE_FACTOR 4
13255#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013256 Py_ssize_t newlen;
13257 PyObject *newbuffer;
13258
Victor Stinnerd3f08822012-05-29 12:57:52 +020013259 assert(length > 0);
13260
Victor Stinner202fdca2012-05-07 12:47:02 +020013261 if (length > PY_SSIZE_T_MAX - writer->pos) {
13262 PyErr_NoMemory();
13263 return -1;
13264 }
13265 newlen = writer->pos + length;
13266
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013267 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013268
Victor Stinnerd3f08822012-05-29 12:57:52 +020013269 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013270 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013271 if (writer->overallocate
13272 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13273 /* overallocate to limit the number of realloc() */
13274 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013275 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013276 if (newlen < writer->min_length)
13277 newlen = writer->min_length;
13278
Victor Stinnerd3f08822012-05-29 12:57:52 +020013279 writer->buffer = PyUnicode_New(newlen, maxchar);
13280 if (writer->buffer == NULL)
13281 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013282 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013283 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013284 if (writer->overallocate
13285 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13286 /* overallocate to limit the number of realloc() */
13287 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013289 if (newlen < writer->min_length)
13290 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013292 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013293 /* resize + widen */
13294 newbuffer = PyUnicode_New(newlen, maxchar);
13295 if (newbuffer == NULL)
13296 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13298 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013299 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013300 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013301 }
13302 else {
13303 newbuffer = resize_compact(writer->buffer, newlen);
13304 if (newbuffer == NULL)
13305 return -1;
13306 }
13307 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013308 }
13309 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013310 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013311 newbuffer = PyUnicode_New(writer->size, maxchar);
13312 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013313 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13315 writer->buffer, 0, writer->pos);
13316 Py_DECREF(writer->buffer);
13317 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013318 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013319 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013320 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013321
13322#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013323}
13324
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013325Py_LOCAL_INLINE(int)
13326_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013327{
13328 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13329 return -1;
13330 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13331 writer->pos++;
13332 return 0;
13333}
13334
13335int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013336_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13337{
13338 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13339}
13340
13341int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013342_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13343{
13344 Py_UCS4 maxchar;
13345 Py_ssize_t len;
13346
13347 if (PyUnicode_READY(str) == -1)
13348 return -1;
13349 len = PyUnicode_GET_LENGTH(str);
13350 if (len == 0)
13351 return 0;
13352 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13353 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013354 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013355 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013356 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013357 Py_INCREF(str);
13358 writer->buffer = str;
13359 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013360 writer->pos += len;
13361 return 0;
13362 }
13363 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13364 return -1;
13365 }
13366 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13367 str, 0, len);
13368 writer->pos += len;
13369 return 0;
13370}
13371
Victor Stinnere215d962012-10-06 23:03:36 +020013372int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013373_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13374 Py_ssize_t start, Py_ssize_t end)
13375{
13376 Py_UCS4 maxchar;
13377 Py_ssize_t len;
13378
13379 if (PyUnicode_READY(str) == -1)
13380 return -1;
13381
13382 assert(0 <= start);
13383 assert(end <= PyUnicode_GET_LENGTH(str));
13384 assert(start <= end);
13385
13386 if (end == 0)
13387 return 0;
13388
13389 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13390 return _PyUnicodeWriter_WriteStr(writer, str);
13391
13392 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13393 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13394 else
13395 maxchar = writer->maxchar;
13396 len = end - start;
13397
13398 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13399 return -1;
13400
13401 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13402 str, start, len);
13403 writer->pos += len;
13404 return 0;
13405}
13406
13407int
Victor Stinner4a587072013-11-19 12:54:53 +010013408_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13409 const char *ascii, Py_ssize_t len)
13410{
13411 if (len == -1)
13412 len = strlen(ascii);
13413
13414 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13415
13416 if (writer->buffer == NULL && !writer->overallocate) {
13417 PyObject *str;
13418
13419 str = _PyUnicode_FromASCII(ascii, len);
13420 if (str == NULL)
13421 return -1;
13422
13423 writer->readonly = 1;
13424 writer->buffer = str;
13425 _PyUnicodeWriter_Update(writer);
13426 writer->pos += len;
13427 return 0;
13428 }
13429
13430 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13431 return -1;
13432
13433 switch (writer->kind)
13434 {
13435 case PyUnicode_1BYTE_KIND:
13436 {
13437 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13438 Py_UCS1 *data = writer->data;
13439
13440 Py_MEMCPY(data + writer->pos, str, len);
13441 break;
13442 }
13443 case PyUnicode_2BYTE_KIND:
13444 {
13445 _PyUnicode_CONVERT_BYTES(
13446 Py_UCS1, Py_UCS2,
13447 ascii, ascii + len,
13448 (Py_UCS2 *)writer->data + writer->pos);
13449 break;
13450 }
13451 case PyUnicode_4BYTE_KIND:
13452 {
13453 _PyUnicode_CONVERT_BYTES(
13454 Py_UCS1, Py_UCS4,
13455 ascii, ascii + len,
13456 (Py_UCS4 *)writer->data + writer->pos);
13457 break;
13458 }
13459 default:
13460 assert(0);
13461 }
13462
13463 writer->pos += len;
13464 return 0;
13465}
13466
13467int
13468_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13469 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013470{
13471 Py_UCS4 maxchar;
13472
13473 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13474 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13475 return -1;
13476 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13477 writer->pos += len;
13478 return 0;
13479}
13480
Victor Stinnerd3f08822012-05-29 12:57:52 +020013481PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013482_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013483{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013484 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013486 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013487 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013489 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013490 str = writer->buffer;
13491 writer->buffer = NULL;
13492 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13493 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 }
13495 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13496 PyObject *newbuffer;
13497 newbuffer = resize_compact(writer->buffer, writer->pos);
13498 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013499 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500 return NULL;
13501 }
13502 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013503 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013504 str = writer->buffer;
13505 writer->buffer = NULL;
13506 assert(_PyUnicode_CheckConsistency(str, 1));
13507 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013508}
13509
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013511_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013512{
13513 Py_CLEAR(writer->buffer);
13514}
13515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013516#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013517
13518PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013520\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013521Return a formatted version of S, using substitutions from args and kwargs.\n\
13522The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013523
Eric Smith27bbca62010-11-04 17:06:58 +000013524PyDoc_STRVAR(format_map__doc__,
13525 "S.format_map(mapping) -> str\n\
13526\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013527Return a formatted version of S, using substitutions from mapping.\n\
13528The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013529
Eric Smith4a7d76d2008-05-30 18:10:19 +000013530static PyObject *
13531unicode__format__(PyObject* self, PyObject* args)
13532{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013533 PyObject *format_spec;
13534 _PyUnicodeWriter writer;
13535 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013536
13537 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13538 return NULL;
13539
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 if (PyUnicode_READY(self) == -1)
13541 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013542 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13544 self, format_spec, 0,
13545 PyUnicode_GET_LENGTH(format_spec));
13546 if (ret == -1) {
13547 _PyUnicodeWriter_Dealloc(&writer);
13548 return NULL;
13549 }
13550 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013551}
13552
Eric Smith8c663262007-08-25 02:26:07 +000013553PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013555\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013556Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013557
13558static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013559unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013561 Py_ssize_t size;
13562
13563 /* If it's a compact object, account for base structure +
13564 character data. */
13565 if (PyUnicode_IS_COMPACT_ASCII(v))
13566 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13567 else if (PyUnicode_IS_COMPACT(v))
13568 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013569 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013570 else {
13571 /* If it is a two-block object, account for base object, and
13572 for character block if present. */
13573 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013574 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013575 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013576 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013577 }
13578 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013579 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013580 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013582 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013583 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013584
13585 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013586}
13587
13588PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013590
13591static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013592unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013593{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013594 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013595 if (!copy)
13596 return NULL;
13597 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013598}
13599
Guido van Rossumd57fd912000-03-10 22:53:23 +000013600static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013601 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013602 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013603 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13604 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013605 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13606 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013607 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013608 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13609 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13610 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013611 {"expandtabs", (PyCFunction) unicode_expandtabs,
13612 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013613 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013614 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013615 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13616 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13617 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013618 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013619 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13620 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13621 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013622 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013623 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013624 {"splitlines", (PyCFunction) unicode_splitlines,
13625 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013626 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013627 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13628 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13629 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13630 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13631 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13632 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13633 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13634 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13635 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13636 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13637 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13638 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13639 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13640 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013641 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013642 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013643 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013644 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013645 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013646 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013647 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013648 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013649#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013650 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013651 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013652#endif
13653
Benjamin Peterson14339b62009-01-31 16:36:08 +000013654 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013655 {NULL, NULL}
13656};
13657
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013658static PyObject *
13659unicode_mod(PyObject *v, PyObject *w)
13660{
Brian Curtindfc80e32011-08-10 20:28:54 -050013661 if (!PyUnicode_Check(v))
13662 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013664}
13665
13666static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013667 0, /*nb_add*/
13668 0, /*nb_subtract*/
13669 0, /*nb_multiply*/
13670 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013671};
13672
Guido van Rossumd57fd912000-03-10 22:53:23 +000013673static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013674 (lenfunc) unicode_length, /* sq_length */
13675 PyUnicode_Concat, /* sq_concat */
13676 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13677 (ssizeargfunc) unicode_getitem, /* sq_item */
13678 0, /* sq_slice */
13679 0, /* sq_ass_item */
13680 0, /* sq_ass_slice */
13681 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682};
13683
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013684static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013685unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687 if (PyUnicode_READY(self) == -1)
13688 return NULL;
13689
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013690 if (PyIndex_Check(item)) {
13691 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013692 if (i == -1 && PyErr_Occurred())
13693 return NULL;
13694 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013695 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013696 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013697 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013698 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013699 PyObject *result;
13700 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013701 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013702 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013704 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013706 return NULL;
13707 }
13708
13709 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013710 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013711 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013712 slicelength == PyUnicode_GET_LENGTH(self)) {
13713 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013714 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013715 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013716 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013717 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013718 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013719 src_kind = PyUnicode_KIND(self);
13720 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013721 if (!PyUnicode_IS_ASCII(self)) {
13722 kind_limit = kind_maxchar_limit(src_kind);
13723 max_char = 0;
13724 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13725 ch = PyUnicode_READ(src_kind, src_data, cur);
13726 if (ch > max_char) {
13727 max_char = ch;
13728 if (max_char >= kind_limit)
13729 break;
13730 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013731 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013732 }
Victor Stinner55c99112011-10-13 01:17:06 +020013733 else
13734 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013735 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013736 if (result == NULL)
13737 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013738 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013739 dest_data = PyUnicode_DATA(result);
13740
13741 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013742 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13743 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013744 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013745 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013746 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013747 } else {
13748 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13749 return NULL;
13750 }
13751}
13752
13753static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013754 (lenfunc)unicode_length, /* mp_length */
13755 (binaryfunc)unicode_subscript, /* mp_subscript */
13756 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013757};
13758
Guido van Rossumd57fd912000-03-10 22:53:23 +000013759
Guido van Rossumd57fd912000-03-10 22:53:23 +000013760/* Helpers for PyUnicode_Format() */
13761
Victor Stinnera47082312012-10-04 02:19:54 +020013762struct unicode_formatter_t {
13763 PyObject *args;
13764 int args_owned;
13765 Py_ssize_t arglen, argidx;
13766 PyObject *dict;
13767
13768 enum PyUnicode_Kind fmtkind;
13769 Py_ssize_t fmtcnt, fmtpos;
13770 void *fmtdata;
13771 PyObject *fmtstr;
13772
13773 _PyUnicodeWriter writer;
13774};
13775
13776struct unicode_format_arg_t {
13777 Py_UCS4 ch;
13778 int flags;
13779 Py_ssize_t width;
13780 int prec;
13781 int sign;
13782};
13783
Guido van Rossumd57fd912000-03-10 22:53:23 +000013784static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013785unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786{
Victor Stinnera47082312012-10-04 02:19:54 +020013787 Py_ssize_t argidx = ctx->argidx;
13788
13789 if (argidx < ctx->arglen) {
13790 ctx->argidx++;
13791 if (ctx->arglen < 0)
13792 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013793 else
Victor Stinnera47082312012-10-04 02:19:54 +020013794 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013795 }
13796 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013797 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798 return NULL;
13799}
13800
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013801/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802
Victor Stinnera47082312012-10-04 02:19:54 +020013803/* Format a float into the writer if the writer is not NULL, or into *p_output
13804 otherwise.
13805
13806 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013807static int
Victor Stinnera47082312012-10-04 02:19:54 +020013808formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13809 PyObject **p_output,
13810 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013812 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013814 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013815 int prec;
13816 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013817
Guido van Rossumd57fd912000-03-10 22:53:23 +000013818 x = PyFloat_AsDouble(v);
13819 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013820 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013821
Victor Stinnera47082312012-10-04 02:19:54 +020013822 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013824 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013825
Victor Stinnera47082312012-10-04 02:19:54 +020013826 if (arg->flags & F_ALT)
13827 dtoa_flags = Py_DTSF_ALT;
13828 else
13829 dtoa_flags = 0;
13830 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013831 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832 return -1;
13833 len = strlen(p);
13834 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013835 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013836 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013838 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839 }
13840 else
13841 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013842 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013843 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013844}
13845
Victor Stinnerd0880d52012-04-27 23:40:13 +020013846/* formatlong() emulates the format codes d, u, o, x and X, and
13847 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13848 * Python's regular ints.
13849 * Return value: a new PyUnicodeObject*, or NULL if error.
13850 * The output string is of the form
13851 * "-"? ("0x" | "0X")? digit+
13852 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13853 * set in flags. The case of hex digits will be correct,
13854 * There will be at least prec digits, zero-filled on the left if
13855 * necessary to get that many.
13856 * val object to be converted
13857 * flags bitmask of format flags; only F_ALT is looked at
13858 * prec minimum number of digits; 0-fill on left if needed
13859 * type a character in [duoxX]; u acts the same as d
13860 *
13861 * CAUTION: o, x and X conversions on regular ints can never
13862 * produce a '-' sign, but can for Python's unbounded ints.
13863 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013864PyObject *
13865_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013866{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013867 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013868 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013869 Py_ssize_t i;
13870 int sign; /* 1 if '-', else 0 */
13871 int len; /* number of characters */
13872 Py_ssize_t llen;
13873 int numdigits; /* len == numnondigits + numdigits */
13874 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013875
Victor Stinnerd0880d52012-04-27 23:40:13 +020013876 /* Avoid exceeding SSIZE_T_MAX */
13877 if (prec > INT_MAX-3) {
13878 PyErr_SetString(PyExc_OverflowError,
13879 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013881 }
13882
13883 assert(PyLong_Check(val));
13884
13885 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013886 default:
13887 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013888 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013889 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013890 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013891 /* int and int subclasses should print numerically when a numeric */
13892 /* format code is used (see issue18780) */
13893 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013894 break;
13895 case 'o':
13896 numnondigits = 2;
13897 result = PyNumber_ToBase(val, 8);
13898 break;
13899 case 'x':
13900 case 'X':
13901 numnondigits = 2;
13902 result = PyNumber_ToBase(val, 16);
13903 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013904 }
13905 if (!result)
13906 return NULL;
13907
13908 assert(unicode_modifiable(result));
13909 assert(PyUnicode_IS_READY(result));
13910 assert(PyUnicode_IS_ASCII(result));
13911
13912 /* To modify the string in-place, there can only be one reference. */
13913 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013914 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013915 PyErr_BadInternalCall();
13916 return NULL;
13917 }
13918 buf = PyUnicode_DATA(result);
13919 llen = PyUnicode_GET_LENGTH(result);
13920 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013921 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013922 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013923 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013924 return NULL;
13925 }
13926 len = (int)llen;
13927 sign = buf[0] == '-';
13928 numnondigits += sign;
13929 numdigits = len - numnondigits;
13930 assert(numdigits > 0);
13931
13932 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013933 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013934 (type == 'o' || type == 'x' || type == 'X'))) {
13935 assert(buf[sign] == '0');
13936 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13937 buf[sign+1] == 'o');
13938 numnondigits -= 2;
13939 buf += 2;
13940 len -= 2;
13941 if (sign)
13942 buf[0] = '-';
13943 assert(len == numnondigits + numdigits);
13944 assert(numdigits > 0);
13945 }
13946
13947 /* Fill with leading zeroes to meet minimum width. */
13948 if (prec > numdigits) {
13949 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13950 numnondigits + prec);
13951 char *b1;
13952 if (!r1) {
13953 Py_DECREF(result);
13954 return NULL;
13955 }
13956 b1 = PyBytes_AS_STRING(r1);
13957 for (i = 0; i < numnondigits; ++i)
13958 *b1++ = *buf++;
13959 for (i = 0; i < prec - numdigits; i++)
13960 *b1++ = '0';
13961 for (i = 0; i < numdigits; i++)
13962 *b1++ = *buf++;
13963 *b1 = '\0';
13964 Py_DECREF(result);
13965 result = r1;
13966 buf = PyBytes_AS_STRING(result);
13967 len = numnondigits + prec;
13968 }
13969
13970 /* Fix up case for hex conversions. */
13971 if (type == 'X') {
13972 /* Need to convert all lower case letters to upper case.
13973 and need to convert 0x to 0X (and -0x to -0X). */
13974 for (i = 0; i < len; i++)
13975 if (buf[i] >= 'a' && buf[i] <= 'x')
13976 buf[i] -= 'a'-'A';
13977 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013978 if (!PyUnicode_Check(result)
13979 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013980 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013981 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013982 Py_DECREF(result);
13983 result = unicode;
13984 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013985 else if (len != PyUnicode_GET_LENGTH(result)) {
13986 if (PyUnicode_Resize(&result, len) < 0)
13987 Py_CLEAR(result);
13988 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013990}
13991
Ethan Furmandf3ed242014-01-05 06:50:30 -080013992/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013993 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013994 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013995 * -1 and raise an exception on error */
13996static int
Victor Stinnera47082312012-10-04 02:19:54 +020013997mainformatlong(PyObject *v,
13998 struct unicode_format_arg_t *arg,
13999 PyObject **p_output,
14000 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014001{
14002 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014003 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014004
14005 if (!PyNumber_Check(v))
14006 goto wrongtype;
14007
Ethan Furman9ab74802014-03-21 06:38:46 -070014008 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014009 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014010 if (type == 'o' || type == 'x' || type == 'X') {
14011 iobj = PyNumber_Index(v);
14012 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014013 if (PyErr_ExceptionMatches(PyExc_TypeError))
14014 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014015 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014016 }
14017 }
14018 else {
14019 iobj = PyNumber_Long(v);
14020 if (iobj == NULL ) {
14021 if (PyErr_ExceptionMatches(PyExc_TypeError))
14022 goto wrongtype;
14023 return -1;
14024 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014025 }
14026 assert(PyLong_Check(iobj));
14027 }
14028 else {
14029 iobj = v;
14030 Py_INCREF(iobj);
14031 }
14032
14033 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014034 && arg->width == -1 && arg->prec == -1
14035 && !(arg->flags & (F_SIGN | F_BLANK))
14036 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014037 {
14038 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014039 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014040 int base;
14041
Victor Stinnera47082312012-10-04 02:19:54 +020014042 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014043 {
14044 default:
14045 assert(0 && "'type' not in [diuoxX]");
14046 case 'd':
14047 case 'i':
14048 case 'u':
14049 base = 10;
14050 break;
14051 case 'o':
14052 base = 8;
14053 break;
14054 case 'x':
14055 case 'X':
14056 base = 16;
14057 break;
14058 }
14059
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014060 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14061 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014062 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014063 }
14064 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014065 return 1;
14066 }
14067
Ethan Furmanb95b5612015-01-23 20:05:18 -080014068 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014069 Py_DECREF(iobj);
14070 if (res == NULL)
14071 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014072 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014073 return 0;
14074
14075wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014076 switch(type)
14077 {
14078 case 'o':
14079 case 'x':
14080 case 'X':
14081 PyErr_Format(PyExc_TypeError,
14082 "%%%c format: an integer is required, "
14083 "not %.200s",
14084 type, Py_TYPE(v)->tp_name);
14085 break;
14086 default:
14087 PyErr_Format(PyExc_TypeError,
14088 "%%%c format: a number is required, "
14089 "not %.200s",
14090 type, Py_TYPE(v)->tp_name);
14091 break;
14092 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014093 return -1;
14094}
14095
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014096static Py_UCS4
14097formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014099 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014100 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014101 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014102 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014103 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014104 goto onError;
14105 }
14106 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014107 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014108 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014109 /* make sure number is a type of integer */
14110 if (!PyLong_Check(v)) {
14111 iobj = PyNumber_Index(v);
14112 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014113 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014114 }
14115 v = iobj;
14116 Py_DECREF(iobj);
14117 }
14118 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 x = PyLong_AsLong(v);
14120 if (x == -1 && PyErr_Occurred())
14121 goto onError;
14122
Victor Stinner8faf8212011-12-08 22:14:11 +010014123 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014124 PyErr_SetString(PyExc_OverflowError,
14125 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014126 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014127 }
14128
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014129 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014130 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014131
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014133 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014134 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014135 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136}
14137
Victor Stinnera47082312012-10-04 02:19:54 +020014138/* Parse options of an argument: flags, width, precision.
14139 Handle also "%(name)" syntax.
14140
14141 Return 0 if the argument has been formatted into arg->str.
14142 Return 1 if the argument has been written into ctx->writer,
14143 Raise an exception and return -1 on error. */
14144static int
14145unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14146 struct unicode_format_arg_t *arg)
14147{
14148#define FORMAT_READ(ctx) \
14149 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14150
14151 PyObject *v;
14152
Victor Stinnera47082312012-10-04 02:19:54 +020014153 if (arg->ch == '(') {
14154 /* Get argument value from a dictionary. Example: "%(name)s". */
14155 Py_ssize_t keystart;
14156 Py_ssize_t keylen;
14157 PyObject *key;
14158 int pcount = 1;
14159
14160 if (ctx->dict == NULL) {
14161 PyErr_SetString(PyExc_TypeError,
14162 "format requires a mapping");
14163 return -1;
14164 }
14165 ++ctx->fmtpos;
14166 --ctx->fmtcnt;
14167 keystart = ctx->fmtpos;
14168 /* Skip over balanced parentheses */
14169 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14170 arg->ch = FORMAT_READ(ctx);
14171 if (arg->ch == ')')
14172 --pcount;
14173 else if (arg->ch == '(')
14174 ++pcount;
14175 ctx->fmtpos++;
14176 }
14177 keylen = ctx->fmtpos - keystart - 1;
14178 if (ctx->fmtcnt < 0 || pcount > 0) {
14179 PyErr_SetString(PyExc_ValueError,
14180 "incomplete format key");
14181 return -1;
14182 }
14183 key = PyUnicode_Substring(ctx->fmtstr,
14184 keystart, keystart + keylen);
14185 if (key == NULL)
14186 return -1;
14187 if (ctx->args_owned) {
14188 Py_DECREF(ctx->args);
14189 ctx->args_owned = 0;
14190 }
14191 ctx->args = PyObject_GetItem(ctx->dict, key);
14192 Py_DECREF(key);
14193 if (ctx->args == NULL)
14194 return -1;
14195 ctx->args_owned = 1;
14196 ctx->arglen = -1;
14197 ctx->argidx = -2;
14198 }
14199
14200 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014201 while (--ctx->fmtcnt >= 0) {
14202 arg->ch = FORMAT_READ(ctx);
14203 ctx->fmtpos++;
14204 switch (arg->ch) {
14205 case '-': arg->flags |= F_LJUST; continue;
14206 case '+': arg->flags |= F_SIGN; continue;
14207 case ' ': arg->flags |= F_BLANK; continue;
14208 case '#': arg->flags |= F_ALT; continue;
14209 case '0': arg->flags |= F_ZERO; continue;
14210 }
14211 break;
14212 }
14213
14214 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014215 if (arg->ch == '*') {
14216 v = unicode_format_getnextarg(ctx);
14217 if (v == NULL)
14218 return -1;
14219 if (!PyLong_Check(v)) {
14220 PyErr_SetString(PyExc_TypeError,
14221 "* wants int");
14222 return -1;
14223 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014224 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014225 if (arg->width == -1 && PyErr_Occurred())
14226 return -1;
14227 if (arg->width < 0) {
14228 arg->flags |= F_LJUST;
14229 arg->width = -arg->width;
14230 }
14231 if (--ctx->fmtcnt >= 0) {
14232 arg->ch = FORMAT_READ(ctx);
14233 ctx->fmtpos++;
14234 }
14235 }
14236 else if (arg->ch >= '0' && arg->ch <= '9') {
14237 arg->width = arg->ch - '0';
14238 while (--ctx->fmtcnt >= 0) {
14239 arg->ch = FORMAT_READ(ctx);
14240 ctx->fmtpos++;
14241 if (arg->ch < '0' || arg->ch > '9')
14242 break;
14243 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14244 mixing signed and unsigned comparison. Since arg->ch is between
14245 '0' and '9', casting to int is safe. */
14246 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14247 PyErr_SetString(PyExc_ValueError,
14248 "width too big");
14249 return -1;
14250 }
14251 arg->width = arg->width*10 + (arg->ch - '0');
14252 }
14253 }
14254
14255 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014256 if (arg->ch == '.') {
14257 arg->prec = 0;
14258 if (--ctx->fmtcnt >= 0) {
14259 arg->ch = FORMAT_READ(ctx);
14260 ctx->fmtpos++;
14261 }
14262 if (arg->ch == '*') {
14263 v = unicode_format_getnextarg(ctx);
14264 if (v == NULL)
14265 return -1;
14266 if (!PyLong_Check(v)) {
14267 PyErr_SetString(PyExc_TypeError,
14268 "* wants int");
14269 return -1;
14270 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014271 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014272 if (arg->prec == -1 && PyErr_Occurred())
14273 return -1;
14274 if (arg->prec < 0)
14275 arg->prec = 0;
14276 if (--ctx->fmtcnt >= 0) {
14277 arg->ch = FORMAT_READ(ctx);
14278 ctx->fmtpos++;
14279 }
14280 }
14281 else if (arg->ch >= '0' && arg->ch <= '9') {
14282 arg->prec = arg->ch - '0';
14283 while (--ctx->fmtcnt >= 0) {
14284 arg->ch = FORMAT_READ(ctx);
14285 ctx->fmtpos++;
14286 if (arg->ch < '0' || arg->ch > '9')
14287 break;
14288 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14289 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014290 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014291 return -1;
14292 }
14293 arg->prec = arg->prec*10 + (arg->ch - '0');
14294 }
14295 }
14296 }
14297
14298 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14299 if (ctx->fmtcnt >= 0) {
14300 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14301 if (--ctx->fmtcnt >= 0) {
14302 arg->ch = FORMAT_READ(ctx);
14303 ctx->fmtpos++;
14304 }
14305 }
14306 }
14307 if (ctx->fmtcnt < 0) {
14308 PyErr_SetString(PyExc_ValueError,
14309 "incomplete format");
14310 return -1;
14311 }
14312 return 0;
14313
14314#undef FORMAT_READ
14315}
14316
14317/* Format one argument. Supported conversion specifiers:
14318
14319 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014320 - "i", "d", "u": int or float
14321 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014322 - "e", "E", "f", "F", "g", "G": float
14323 - "c": int or str (1 character)
14324
Victor Stinner8dbd4212012-12-04 09:30:24 +010014325 When possible, the output is written directly into the Unicode writer
14326 (ctx->writer). A string is created when padding is required.
14327
Victor Stinnera47082312012-10-04 02:19:54 +020014328 Return 0 if the argument has been formatted into *p_str,
14329 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014330 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014331static int
14332unicode_format_arg_format(struct unicode_formatter_t *ctx,
14333 struct unicode_format_arg_t *arg,
14334 PyObject **p_str)
14335{
14336 PyObject *v;
14337 _PyUnicodeWriter *writer = &ctx->writer;
14338
14339 if (ctx->fmtcnt == 0)
14340 ctx->writer.overallocate = 0;
14341
14342 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014343 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014344 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014345 return 1;
14346 }
14347
14348 v = unicode_format_getnextarg(ctx);
14349 if (v == NULL)
14350 return -1;
14351
Victor Stinnera47082312012-10-04 02:19:54 +020014352
14353 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014354 case 's':
14355 case 'r':
14356 case 'a':
14357 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14358 /* Fast path */
14359 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14360 return -1;
14361 return 1;
14362 }
14363
14364 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14365 *p_str = v;
14366 Py_INCREF(*p_str);
14367 }
14368 else {
14369 if (arg->ch == 's')
14370 *p_str = PyObject_Str(v);
14371 else if (arg->ch == 'r')
14372 *p_str = PyObject_Repr(v);
14373 else
14374 *p_str = PyObject_ASCII(v);
14375 }
14376 break;
14377
14378 case 'i':
14379 case 'd':
14380 case 'u':
14381 case 'o':
14382 case 'x':
14383 case 'X':
14384 {
14385 int ret = mainformatlong(v, arg, p_str, writer);
14386 if (ret != 0)
14387 return ret;
14388 arg->sign = 1;
14389 break;
14390 }
14391
14392 case 'e':
14393 case 'E':
14394 case 'f':
14395 case 'F':
14396 case 'g':
14397 case 'G':
14398 if (arg->width == -1 && arg->prec == -1
14399 && !(arg->flags & (F_SIGN | F_BLANK)))
14400 {
14401 /* Fast path */
14402 if (formatfloat(v, arg, NULL, writer) == -1)
14403 return -1;
14404 return 1;
14405 }
14406
14407 arg->sign = 1;
14408 if (formatfloat(v, arg, p_str, NULL) == -1)
14409 return -1;
14410 break;
14411
14412 case 'c':
14413 {
14414 Py_UCS4 ch = formatchar(v);
14415 if (ch == (Py_UCS4) -1)
14416 return -1;
14417 if (arg->width == -1 && arg->prec == -1) {
14418 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014419 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014420 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014421 return 1;
14422 }
14423 *p_str = PyUnicode_FromOrdinal(ch);
14424 break;
14425 }
14426
14427 default:
14428 PyErr_Format(PyExc_ValueError,
14429 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014430 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014431 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14432 (int)arg->ch,
14433 ctx->fmtpos - 1);
14434 return -1;
14435 }
14436 if (*p_str == NULL)
14437 return -1;
14438 assert (PyUnicode_Check(*p_str));
14439 return 0;
14440}
14441
/* Write the already formatted argument `str` into ctx->writer, applying the
   width, precision, sign, fill character and alternate-form prefix ('0'
   followed by the format character, for "x", "X" and "o") recorded in `arg`.

   `arg` doubles as scratch space: arg->sign and arg->width are modified
   while the output is laid out.

   Return 0 on success, -1 if PyUnicode_READY() or a writer call fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero fill is only used when arg->sign is set and the F_ZERO flag
       was given; everything else is padded with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: nothing to pad, truncate or sign, copy str as is */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* Detach an explicit sign from the digits: it is written
               separately, and len/pindex now describe the digits only. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written after all */
            arg->sign = 0;
    }
    /* From here on arg->width is never smaller than len */
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only counts towards maxchar when left padding
       will actually be written */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* Reserve one extra slot for the sign character, which len does
       not count */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With zero fill the sign goes in front of the padding; with
           space fill it is written after the padding, further below. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses up one column of the field */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* In both cases the two prefix characters stop counting as part
           of the digits; with space fill they are written (and pindex is
           advanced) after the padding, further below. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done: no right padding will follow */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed: always with spaces, never with the fill
       character. arg->width can only still exceed len here when the
       left padding above was skipped because of F_LJUST. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14596
14597/* Helper of PyUnicode_Format(): format one arg.
14598 Return 0 on success, raise an exception and return -1 on error. */
14599static int
14600unicode_format_arg(struct unicode_formatter_t *ctx)
14601{
14602 struct unicode_format_arg_t arg;
14603 PyObject *str;
14604 int ret;
14605
Victor Stinner8dbd4212012-12-04 09:30:24 +010014606 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14607 arg.flags = 0;
14608 arg.width = -1;
14609 arg.prec = -1;
14610 arg.sign = 0;
14611 str = NULL;
14612
Victor Stinnera47082312012-10-04 02:19:54 +020014613 ret = unicode_format_arg_parse(ctx, &arg);
14614 if (ret == -1)
14615 return -1;
14616
14617 ret = unicode_format_arg_format(ctx, &arg, &str);
14618 if (ret == -1)
14619 return -1;
14620
14621 if (ret != 1) {
14622 ret = unicode_format_arg_output(ctx, &arg, str);
14623 Py_DECREF(str);
14624 if (ret == -1)
14625 return -1;
14626 }
14627
14628 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14629 PyErr_SetString(PyExc_TypeError,
14630 "not all arguments converted during string formatting");
14631 return -1;
14632 }
14633 return 0;
14634}
14635
Alexander Belopolsky40018472011-02-26 01:02:56 +000014636PyObject *
14637PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014638{
Victor Stinnera47082312012-10-04 02:19:54 +020014639 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014640
Guido van Rossumd57fd912000-03-10 22:53:23 +000014641 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014642 PyErr_BadInternalCall();
14643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014644 }
Victor Stinnera47082312012-10-04 02:19:54 +020014645
14646 ctx.fmtstr = PyUnicode_FromObject(format);
14647 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014648 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014649 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14650 Py_DECREF(ctx.fmtstr);
14651 return NULL;
14652 }
14653 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14654 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14655 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14656 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014657
Victor Stinner8f674cc2013-04-17 23:02:17 +020014658 _PyUnicodeWriter_Init(&ctx.writer);
14659 ctx.writer.min_length = ctx.fmtcnt + 100;
14660 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014661
Guido van Rossumd57fd912000-03-10 22:53:23 +000014662 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014663 ctx.arglen = PyTuple_Size(args);
14664 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014665 }
14666 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014667 ctx.arglen = -1;
14668 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014669 }
Victor Stinnera47082312012-10-04 02:19:54 +020014670 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014671 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014672 ctx.dict = args;
14673 else
14674 ctx.dict = NULL;
14675 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014676
Victor Stinnera47082312012-10-04 02:19:54 +020014677 while (--ctx.fmtcnt >= 0) {
14678 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014679 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014680
14681 nonfmtpos = ctx.fmtpos++;
14682 while (ctx.fmtcnt >= 0 &&
14683 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14684 ctx.fmtpos++;
14685 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014686 }
Victor Stinnera47082312012-10-04 02:19:54 +020014687 if (ctx.fmtcnt < 0) {
14688 ctx.fmtpos--;
14689 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014690 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014691
Victor Stinnercfc4c132013-04-03 01:48:39 +020014692 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14693 nonfmtpos, ctx.fmtpos) < 0)
14694 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014695 }
14696 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014697 ctx.fmtpos++;
14698 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014699 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014700 }
14701 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014702
Victor Stinnera47082312012-10-04 02:19:54 +020014703 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014704 PyErr_SetString(PyExc_TypeError,
14705 "not all arguments converted during string formatting");
14706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014707 }
14708
Victor Stinnera47082312012-10-04 02:19:54 +020014709 if (ctx.args_owned) {
14710 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014711 }
Victor Stinnera47082312012-10-04 02:19:54 +020014712 Py_DECREF(ctx.fmtstr);
14713 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014714
Benjamin Peterson29060642009-01-31 22:14:21 +000014715 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014716 Py_DECREF(ctx.fmtstr);
14717 _PyUnicodeWriter_Dealloc(&ctx.writer);
14718 if (ctx.args_owned) {
14719 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014720 }
14721 return NULL;
14722}
14723
Jeremy Hylton938ace62002-07-17 16:30:39 +000014724static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014725unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14726
Tim Peters6d6c1a32001-08-02 04:15:00 +000014727static PyObject *
14728unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14729{
Benjamin Peterson29060642009-01-31 22:14:21 +000014730 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014731 static char *kwlist[] = {"object", "encoding", "errors", 0};
14732 char *encoding = NULL;
14733 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014734
Benjamin Peterson14339b62009-01-31 16:36:08 +000014735 if (type != &PyUnicode_Type)
14736 return unicode_subtype_new(type, args, kwds);
14737 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014739 return NULL;
14740 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014741 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014742 if (encoding == NULL && errors == NULL)
14743 return PyObject_Str(x);
14744 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014745 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014746}
14747
Guido van Rossume023fe02001-08-30 03:12:59 +000014748static PyObject *
14749unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14750{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014751 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014752 Py_ssize_t length, char_size;
14753 int share_wstr, share_utf8;
14754 unsigned int kind;
14755 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014756
Benjamin Peterson14339b62009-01-31 16:36:08 +000014757 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014758
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014759 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014760 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014761 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014762 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014763 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014764 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014765 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014766 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014767
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014768 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014769 if (self == NULL) {
14770 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014771 return NULL;
14772 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014773 kind = PyUnicode_KIND(unicode);
14774 length = PyUnicode_GET_LENGTH(unicode);
14775
14776 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014777#ifdef Py_DEBUG
14778 _PyUnicode_HASH(self) = -1;
14779#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014780 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014781#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014782 _PyUnicode_STATE(self).interned = 0;
14783 _PyUnicode_STATE(self).kind = kind;
14784 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014785 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014786 _PyUnicode_STATE(self).ready = 1;
14787 _PyUnicode_WSTR(self) = NULL;
14788 _PyUnicode_UTF8_LENGTH(self) = 0;
14789 _PyUnicode_UTF8(self) = NULL;
14790 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014791 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792
14793 share_utf8 = 0;
14794 share_wstr = 0;
14795 if (kind == PyUnicode_1BYTE_KIND) {
14796 char_size = 1;
14797 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14798 share_utf8 = 1;
14799 }
14800 else if (kind == PyUnicode_2BYTE_KIND) {
14801 char_size = 2;
14802 if (sizeof(wchar_t) == 2)
14803 share_wstr = 1;
14804 }
14805 else {
14806 assert(kind == PyUnicode_4BYTE_KIND);
14807 char_size = 4;
14808 if (sizeof(wchar_t) == 4)
14809 share_wstr = 1;
14810 }
14811
14812 /* Ensure we won't overflow the length. */
14813 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14814 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014815 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014816 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014817 data = PyObject_MALLOC((length + 1) * char_size);
14818 if (data == NULL) {
14819 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014820 goto onError;
14821 }
14822
Victor Stinnerc3c74152011-10-02 20:39:55 +020014823 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014824 if (share_utf8) {
14825 _PyUnicode_UTF8_LENGTH(self) = length;
14826 _PyUnicode_UTF8(self) = data;
14827 }
14828 if (share_wstr) {
14829 _PyUnicode_WSTR_LENGTH(self) = length;
14830 _PyUnicode_WSTR(self) = (wchar_t *)data;
14831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014832
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014833 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014834 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014835 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014836#ifdef Py_DEBUG
14837 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14838#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014839 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014840 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014841
14842onError:
14843 Py_DECREF(unicode);
14844 Py_DECREF(self);
14845 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014846}
14847
/* Docstring of the str type (installed as tp_doc below). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");

/* Defined further down in this file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str. Slots are positional: each entry is labelled with
   the PyTypeObject field it initializes, and 0 leaves a slot unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14905
14906/* Initialize the Unicode implementation */
14907
Victor Stinner3a50e702011-10-18 21:21:00 +020014908int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014909{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014910 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014911 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014912 0x000A, /* LINE FEED */
14913 0x000D, /* CARRIAGE RETURN */
14914 0x001C, /* FILE SEPARATOR */
14915 0x001D, /* GROUP SEPARATOR */
14916 0x001E, /* RECORD SEPARATOR */
14917 0x0085, /* NEXT LINE */
14918 0x2028, /* LINE SEPARATOR */
14919 0x2029, /* PARAGRAPH SEPARATOR */
14920 };
14921
Fred Drakee4315f52000-05-09 19:53:39 +000014922 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014923 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014924 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014925 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014926 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014927
Guido van Rossumcacfc072002-05-24 19:01:59 +000014928 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014929 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014930
14931 /* initialize the linebreak bloom filter */
14932 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014933 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014934 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014935
Christian Heimes26532f72013-07-20 14:57:16 +020014936 if (PyType_Ready(&EncodingMapType) < 0)
14937 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014938
Benjamin Petersonc4311282012-10-30 23:21:10 -040014939 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14940 Py_FatalError("Can't initialize field name iterator type");
14941
14942 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14943 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014944
Victor Stinner3a50e702011-10-18 21:21:00 +020014945 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946}
14947
14948/* Finalize the Unicode implementation */
14949
/* This function does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14955
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956void
Thomas Wouters78890102000-07-22 19:25:51 +000014957_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014958{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014959 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960
Serhiy Storchaka05997252013-01-26 12:14:02 +020014961 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014962
Serhiy Storchaka05997252013-01-26 12:14:02 +020014963 for (i = 0; i < 256; i++)
14964 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014965 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014966 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014967}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014968
Walter Dörwald16807132007-05-25 13:52:07 +000014969void
14970PyUnicode_InternInPlace(PyObject **p)
14971{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014972 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014974#ifdef Py_DEBUG
14975 assert(s != NULL);
14976 assert(_PyUnicode_CHECK(s));
14977#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014978 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014979 return;
14980#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014981 /* If it's a subclass, we don't really know what putting
14982 it in the interned dict might do. */
14983 if (!PyUnicode_CheckExact(s))
14984 return;
14985 if (PyUnicode_CHECK_INTERNED(s))
14986 return;
14987 if (interned == NULL) {
14988 interned = PyDict_New();
14989 if (interned == NULL) {
14990 PyErr_Clear(); /* Don't leave an exception */
14991 return;
14992 }
14993 }
14994 /* It might be that the GetItem call fails even
14995 though the key is present in the dictionary,
14996 namely when this happens during a stack overflow. */
14997 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014998 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015000
Victor Stinnerf0335102013-04-14 19:13:03 +020015001 if (t) {
15002 Py_INCREF(t);
15003 Py_DECREF(*p);
15004 *p = t;
15005 return;
15006 }
Walter Dörwald16807132007-05-25 13:52:07 +000015007
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015009 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015010 PyErr_Clear();
15011 PyThreadState_GET()->recursion_critical = 0;
15012 return;
15013 }
15014 PyThreadState_GET()->recursion_critical = 0;
15015 /* The two references in interned are not counted by refcnt.
15016 The deallocator will take care of this */
15017 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015018 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015019}
15020
15021void
15022PyUnicode_InternImmortal(PyObject **p)
15023{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015024 PyUnicode_InternInPlace(p);
15025 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015026 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015027 Py_INCREF(*p);
15028 }
Walter Dörwald16807132007-05-25 13:52:07 +000015029}
15030
15031PyObject *
15032PyUnicode_InternFromString(const char *cp)
15033{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 PyObject *s = PyUnicode_FromString(cp);
15035 if (s == NULL)
15036 return NULL;
15037 PyUnicode_InternInPlace(&s);
15038 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015039}
15040
Alexander Belopolsky40018472011-02-26 01:02:56 +000015041void
15042_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015043{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015045 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 Py_ssize_t i, n;
15047 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015048
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 if (interned == NULL || !PyDict_Check(interned))
15050 return;
15051 keys = PyDict_Keys(interned);
15052 if (keys == NULL || !PyList_Check(keys)) {
15053 PyErr_Clear();
15054 return;
15055 }
Walter Dörwald16807132007-05-25 13:52:07 +000015056
Benjamin Peterson14339b62009-01-31 16:36:08 +000015057 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15058 detector, interned unicode strings are not forcibly deallocated;
15059 rather, we give them their stolen references back, and then clear
15060 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015061
Benjamin Peterson14339b62009-01-31 16:36:08 +000015062 n = PyList_GET_SIZE(keys);
15063 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015064 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015066 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015067 if (PyUnicode_READY(s) == -1) {
15068 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015069 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015071 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015072 case SSTATE_NOT_INTERNED:
15073 /* XXX Shouldn't happen */
15074 break;
15075 case SSTATE_INTERNED_IMMORTAL:
15076 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015077 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 break;
15079 case SSTATE_INTERNED_MORTAL:
15080 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015081 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 break;
15083 default:
15084 Py_FatalError("Inconsistent interned string state.");
15085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015086 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 }
15088 fprintf(stderr, "total size of all interned strings: "
15089 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15090 "mortal/immortal\n", mortal_size, immortal_size);
15091 Py_DECREF(keys);
15092 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015093 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015094}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015095
15096
15097/********************* Unicode Iterator **************************/
15098
/* State of an iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return
                               (see unicodeiter_next()) */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15104
15105static void
15106unicodeiter_dealloc(unicodeiterobject *it)
15107{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 _PyObject_GC_UNTRACK(it);
15109 Py_XDECREF(it->it_seq);
15110 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015111}
15112
/* GC traversal of the str iterator: the only object reference it holds is
   it->it_seq.  Py_VISIT() uses the `visit` and `arg` parameters by name. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15119
15120static PyObject *
15121unicodeiter_next(unicodeiterobject *it)
15122{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015123 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015124
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 assert(it != NULL);
15126 seq = it->it_seq;
15127 if (seq == NULL)
15128 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015129 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015131 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15132 int kind = PyUnicode_KIND(seq);
15133 void *data = PyUnicode_DATA(seq);
15134 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15135 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 if (item != NULL)
15137 ++it->it_index;
15138 return item;
15139 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015140
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 Py_DECREF(seq);
15142 it->it_seq = NULL;
15143 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015144}
15145
15146static PyObject *
15147unicodeiter_len(unicodeiterobject *it)
15148{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 Py_ssize_t len = 0;
15150 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015151 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015153}
15154
15155PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15156
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015157static PyObject *
15158unicodeiter_reduce(unicodeiterobject *it)
15159{
15160 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015161 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015162 it->it_seq, it->it_index);
15163 } else {
15164 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15165 if (u == NULL)
15166 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015167 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015168 }
15169}
15170
15171PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15172
15173static PyObject *
15174unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15175{
15176 Py_ssize_t index = PyLong_AsSsize_t(state);
15177 if (index == -1 && PyErr_Occurred())
15178 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015179 if (it->it_seq != NULL) {
15180 if (index < 0)
15181 index = 0;
15182 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15183 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15184 it->it_index = index;
15185 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015186 Py_RETURN_NONE;
15187}
15188
15189PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15190
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015191static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015193 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015194 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15195 reduce_doc},
15196 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15197 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015199};
15200
15201PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15203 "str_iterator", /* tp_name */
15204 sizeof(unicodeiterobject), /* tp_basicsize */
15205 0, /* tp_itemsize */
15206 /* methods */
15207 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15208 0, /* tp_print */
15209 0, /* tp_getattr */
15210 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015211 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 0, /* tp_repr */
15213 0, /* tp_as_number */
15214 0, /* tp_as_sequence */
15215 0, /* tp_as_mapping */
15216 0, /* tp_hash */
15217 0, /* tp_call */
15218 0, /* tp_str */
15219 PyObject_GenericGetAttr, /* tp_getattro */
15220 0, /* tp_setattro */
15221 0, /* tp_as_buffer */
15222 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15223 0, /* tp_doc */
15224 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15225 0, /* tp_clear */
15226 0, /* tp_richcompare */
15227 0, /* tp_weaklistoffset */
15228 PyObject_SelfIter, /* tp_iter */
15229 (iternextfunc)unicodeiter_next, /* tp_iternext */
15230 unicodeiter_methods, /* tp_methods */
15231 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015232};
15233
15234static PyObject *
15235unicode_iter(PyObject *seq)
15236{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015237 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015238
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 if (!PyUnicode_Check(seq)) {
15240 PyErr_BadInternalCall();
15241 return NULL;
15242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015243 if (PyUnicode_READY(seq) == -1)
15244 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15246 if (it == NULL)
15247 return NULL;
15248 it->it_index = 0;
15249 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015250 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 _PyObject_GC_TRACK(it);
15252 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015253}
15254
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015255
/* Return the number of Py_UNICODE elements in `u` before the terminating
   null element.

   The length is derived from a pointer difference and returned as size_t,
   so it cannot overflow the way an `int` counter does on buffers longer
   than INT_MAX elements. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end != 0)
        end++;
    return (size_t)(end - u);
}
15264
15265Py_UNICODE*
15266Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15267{
15268 Py_UNICODE *u = s1;
15269 while ((*u++ = *s2++));
15270 return s1;
15271}
15272
/* Copy at most `n` Py_UNICODE elements from `s2` into `s1`, stopping after
   the terminating null element if it is reached first.  Returns `s1`.

   Nothing is written when `n` is 0, and never more than `n` elements are
   stored.  (Checking the count only after the store would write up to
   n + 1 elements and overrun a destination sized for exactly `n`.)

   As with strncpy(), `s1` is not null-terminated when `s2` holds `n` or
   more elements.  Unlike strncpy(), the remainder of `s1` is not
   zero-padded when `s2` is shorter than `n`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        if ((s1[i] = s2[i]) == 0)
            break;
    }
    return s1;
}
15282
15283Py_UNICODE*
15284Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15285{
15286 Py_UNICODE *u1 = s1;
15287 u1 += Py_UNICODE_strlen(u1);
15288 Py_UNICODE_strcpy(u1, s2);
15289 return s1;
15290}
15291
15292int
15293Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15294{
15295 while (*s1 && *s2 && *s1 == *s2)
15296 s1++, s2++;
15297 if (*s1 && *s2)
15298 return (*s1 < *s2) ? -1 : +1;
15299 if (*s1)
15300 return 1;
15301 if (*s2)
15302 return -1;
15303 return 0;
15304}
15305
15306int
15307Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15308{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015309 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015310 for (; n != 0; n--) {
15311 u1 = *s1;
15312 u2 = *s2;
15313 if (u1 != u2)
15314 return (u1 < u2) ? -1 : +1;
15315 if (u1 == '\0')
15316 return 0;
15317 s1++;
15318 s2++;
15319 }
15320 return 0;
15321}
15322
15323Py_UNICODE*
15324Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15325{
15326 const Py_UNICODE *p;
15327 for (p = s; *p; p++)
15328 if (*p == c)
15329 return (Py_UNICODE*)p;
15330 return NULL;
15331}
15332
15333Py_UNICODE*
15334Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15335{
15336 const Py_UNICODE *p;
15337 p = s + Py_UNICODE_strlen(s);
15338 while (p != s) {
15339 p--;
15340 if (*p == c)
15341 return (Py_UNICODE*)p;
15342 }
15343 return NULL;
15344}
Victor Stinner331ea922010-08-10 16:37:20 +000015345
Victor Stinner71133ff2010-09-01 23:43:53 +000015346Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015347PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015348{
Victor Stinner577db2c2011-10-11 22:12:48 +020015349 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015350 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015352 if (!PyUnicode_Check(unicode)) {
15353 PyErr_BadArgument();
15354 return NULL;
15355 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015356 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015357 if (u == NULL)
15358 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015359 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015360 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015361 PyErr_NoMemory();
15362 return NULL;
15363 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015364 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015365 size *= sizeof(Py_UNICODE);
15366 copy = PyMem_Malloc(size);
15367 if (copy == NULL) {
15368 PyErr_NoMemory();
15369 return NULL;
15370 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015371 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015372 return copy;
15373}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015374
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
15377
15378static PyMethodDef _string_methods[] = {
15379 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15380 METH_O, PyDoc_STR("split the argument as a field name")},
15381 {"formatter_parser", (PyCFunction) formatter_parser,
15382 METH_O, PyDoc_STR("parse the argument as a format string")},
15383 {NULL, NULL}
15384};
15385
15386static struct PyModuleDef _string_module = {
15387 PyModuleDef_HEAD_INIT,
15388 "_string",
15389 PyDoc_STR("string helper module"),
15390 0,
15391 _string_methods,
15392 NULL,
15393 NULL,
15394 NULL,
15395 NULL
15396};
15397
15398PyMODINIT_FUNC
15399PyInit__string(void)
15400{
15401 return PyModule_Create(&_string_module);
15402}
15403
15404
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015405#ifdef __cplusplus
15406}
15407#endif