blob: 9611ed41f7fe81a51adf0b795852c224eee9bfe5 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process, Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Highest code point a string may hold: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff

/* Type check used inside the assertions of this file.  Debug builds run
   the structural consistency checker (with check_content == 0, i.e. without
   scanning the characters); all other builds only test the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Field accessors.

   The names starting with an underscore are plain lvalue accessors that
   cast the object to the structure holding the field; unless noted they
   perform no checking.  The names without the underscore are read-only
   variants that assert the object is a ready string and special-case
   compact ASCII objects. */

/* Cached UTF-8 buffer pointer (raw field). */
#define _PyUnicode_UTF8(op)                                 \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory located right after the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char*)((PyASCIIObject*)(op) + 1)) :              \
         _PyUnicode_UTF8(op))

/* Cached UTF-8 length (raw field). */
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII objects this is the
   character length, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject*)(op))->length :                   \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation pointer and its length (raw fields). */
#define _PyUnicode_WSTR(op)                                 \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Length, state bit-field and cached hash (raw fields). */
#define _PyUnicode_LENGTH(op)                               \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                 \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length accessors that additionally assert _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Character buffer pointer stored in a PyUnicodeObject (raw field). */
#define _PyUnicode_DATA_ANY(op)                             \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ?                              \
      0 :                                                   \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the UTF-8 pointer of op is the character buffer itself, i.e. the
   UTF-8 representation shares memory with the canonical data.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character buffer itself, i.e. the
   wchar_t representation shares memory with the canonical data.

   The comparison uses the macro parameter `op`.  It previously expanded to
   _PyUnicode_WSTR(unicode), which silently captured whatever variable named
   `unicode` existed at the call site and failed to compile anywhere else. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character buffer (not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character buffer (not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro copying a run of characters while converting each
   element from one integer type to another.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to should be
   of type "to_type *" and points to the buffer receiving the result.

   The bulk of the input is handled four elements per iteration; the
   remaining (at most three) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _count = (_end) - (_iter);           \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        for (; _iter < (_unrolled_end);                 \
               _iter += 4, _to += 4) {                  \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); _iter++, _to++)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.

   Indexed by an ASCII code point (0..127); an entry is 1 if the character
   is whitespace and 0 otherwise.  The entries set are:

     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: TAB, LF, VT, FF, CR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F: FS, GS, RS, US */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27: SPACE */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same for linebreaks: fast detection of ASCII line break characters.

   Indexed by an ASCII code point (0..127); an entry is 1 if the character
   is a line break and 0 otherwise.  The entries set are:

     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN

   The table is only ever read, so it is declared const like
   _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: LF, VT, FF, CR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F: FS, GS, RS */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-build sanity checker for the internal layout of a string object.

   Every invariant is verified with assert(), so a violation aborts the
   process; when all checks pass the function returns 1, which lets callers
   wrap the call itself in assert().

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- if non-zero (and the string is not a legacy wchar_t
                    string), additionally scan the characters to verify
                    that the narrowest possible kind is used and that the
                    buffer ends with a null character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII: only the PyASCIIObject header is present. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* Kind whose character width equals sizeof(wchar_t); strings of
           that kind are expected to share their buffer with wstr. */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: characters follow the compact header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Non-compact: characters live in a separate buffer. */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Legacy string that is not ready yet: only wstr is set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Legacy string that has been made ready. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation. */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* A missing cache must have a zero cached length. */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t pos;

        /* Find the largest code point stored in the string. */
        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 current = PyUnicode_READ(kind, data, pos);
            if (maxchar < current)
                maxchar = current;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }

        /* The buffer must be terminated by a null character. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
526/* stuff to implement simple "bloom filters" for Unicode characters.
527 to keep things simple, we use a single bitmask, using the least 5
528 bits from each unicode characters as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542#define BLOOM_MASK unsigned long
543
Serhiy Storchaka05997252013-01-26 12:14:02 +0200544static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Benjamin Peterson29060642009-01-31 22:14:21 +0000548#define BLOOM_LINEBREAK(ch) \
549 ((ch) < 128U ? ascii_linebreak[(ch)] : \
550 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
Victor Stinnerafffce42012-10-03 23:03:17 +0200677#ifdef Py_DEBUG
678/* Fill the data of an Unicode string with invalid characters to detect bugs
679 earlier.
680
681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
683 invalid character in Unicode 6.0. */
684static void
685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
686{
687 int kind = PyUnicode_KIND(unicode);
688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
689 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
690 if (length <= old_length)
691 return;
692 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
693}
694#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Victor Stinner84def372011-12-11 20:04:56 +0100725 _Py_DEC_REFTOTAL;
726 _Py_ForgetReference(unicode);
727
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300728 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100729 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100730 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 PyErr_NoMemory();
732 return NULL;
733 }
Victor Stinner84def372011-12-11 20:04:56 +0100734 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100736
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100740 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_WSTR_LENGTH(unicode) = length;
742 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100743 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
744 PyObject_DEL(_PyUnicode_WSTR(unicode));
745 _PyUnicode_WSTR(unicode) = NULL;
746 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200747#ifdef Py_DEBUG
748 unicode_fill_invalid(unicode, old_length);
749#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
751 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200752 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 return unicode;
754}
755
Alexander Belopolsky40018472011-02-26 01:02:56 +0000756static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200757resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
Victor Stinner95663112011-10-04 01:03:50 +0200759 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100760 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200761 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200762 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 if (PyUnicode_IS_READY(unicode)) {
765 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200766 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
770#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771
772 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200773 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200774 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
775 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776
777 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 new_size = (length + 1) * char_size;
782
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
784 {
785 PyObject_DEL(_PyUnicode_UTF8(unicode));
786 _PyUnicode_UTF8(unicode) = NULL;
787 _PyUnicode_UTF8_LENGTH(unicode) = 0;
788 }
789
Victor Stinnerfe226c02011-10-03 03:52:20 +0200790 data = (PyObject *)PyObject_REALLOC(data, new_size);
791 if (data == NULL) {
792 PyErr_NoMemory();
793 return -1;
794 }
795 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200796 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 _PyUnicode_WSTR_LENGTH(unicode) = length;
799 }
800 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200801 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_UTF8_LENGTH(unicode) = length;
803 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_LENGTH(unicode) = length;
805 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200806#ifdef Py_DEBUG
807 unicode_fill_invalid(unicode, old_length);
808#endif
Victor Stinner95663112011-10-04 01:03:50 +0200809 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200810 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
Victor Stinner95663112011-10-04 01:03:50 +0200814 assert(_PyUnicode_WSTR(unicode) != NULL);
815
816 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700817 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200818 PyErr_NoMemory();
819 return -1;
820 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100821 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200822 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200824 if (!wstr) {
825 PyErr_NoMemory();
826 return -1;
827 }
828 _PyUnicode_WSTR(unicode) = wstr;
829 _PyUnicode_WSTR(unicode)[length] = 0;
830 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200831 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832 return 0;
833}
834
Victor Stinnerfe226c02011-10-03 03:52:20 +0200835static PyObject*
836resize_copy(PyObject *unicode, Py_ssize_t length)
837{
838 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100839 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200840 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841
Benjamin Petersonbac79492012-01-14 13:34:47 -0500842 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844
845 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
846 if (copy == NULL)
847 return NULL;
848
849 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200850 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200852 }
853 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200854 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100855
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200857 if (w == NULL)
858 return NULL;
859 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
860 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200861 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
862 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 }
865}
866
/* We allocate one more character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000872 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
874*/
875
Alexander Belopolsky40018472011-02-26 01:02:56 +0000876static PyUnicodeObject *
877_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200879 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 if (length == 0 && unicode_empty != NULL) {
884 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200885 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886 }
887
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000888 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700889 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 return (PyUnicodeObject *)PyErr_NoMemory();
891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 if (length < 0) {
893 PyErr_SetString(PyExc_SystemError,
894 "Negative size passed to _PyUnicode_New");
895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896 }
897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
899 if (unicode == NULL)
900 return NULL;
901 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100902
903 _PyUnicode_WSTR_LENGTH(unicode) = length;
904 _PyUnicode_HASH(unicode) = -1;
905 _PyUnicode_STATE(unicode).interned = 0;
906 _PyUnicode_STATE(unicode).kind = 0;
907 _PyUnicode_STATE(unicode).compact = 0;
908 _PyUnicode_STATE(unicode).ready = 0;
909 _PyUnicode_STATE(unicode).ascii = 0;
910 _PyUnicode_DATA_ANY(unicode) = NULL;
911 _PyUnicode_LENGTH(unicode) = 0;
912 _PyUnicode_UTF8(unicode) = NULL;
913 _PyUnicode_UTF8_LENGTH(unicode) = 0;
914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
916 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921
Jeremy Hyltond8082792003-09-16 19:41:39 +0000922 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000923 * the caller fails before initializing str -- unicode_resize()
924 * reads str[0], and the Keep-Alive optimization can keep memory
925 * allocated for str alive across a call to unicode_dealloc(unicode).
926 * We don't want unicode_resize to read uninitialized memory in
927 * that case.
928 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 _PyUnicode_WSTR(unicode)[0] = 0;
930 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100931
Victor Stinner7931d9a2011-11-04 00:22:48 +0100932 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933 return unicode;
934}
935
Victor Stinnerf42dc442011-10-02 23:33:16 +0200936static const char*
937unicode_kind_name(PyObject *unicode)
938{
Victor Stinner42dfd712011-10-03 14:41:45 +0200939 /* don't check consistency: unicode_kind_name() is called from
940 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941 if (!PyUnicode_IS_COMPACT(unicode))
942 {
943 if (!PyUnicode_IS_READY(unicode))
944 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600945 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 {
947 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200948 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200949 return "legacy ascii";
950 else
951 return "legacy latin1";
952 case PyUnicode_2BYTE_KIND:
953 return "legacy UCS2";
954 case PyUnicode_4BYTE_KIND:
955 return "legacy UCS4";
956 default:
957 return "<legacy invalid kind>";
958 }
959 }
960 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600961 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 return "ascii";
965 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200966 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 default:
972 return "<invalid compact kind>";
973 }
974}
975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
981
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
985void *_PyUnicode_data(void *unicode){
986 printf("obj %p\n", unicode);
987 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
988 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
989 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
990 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
991 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
992 return PyUnicode_DATA(unicode);
993}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200994
995void
996_PyUnicode_Dump(PyObject *op)
997{
998 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200999 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1000 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1001 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001002
Victor Stinnera849a4b2011-10-03 12:12:11 +02001003 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001004 {
1005 if (ascii->state.ascii)
1006 data = (ascii + 1);
1007 else
1008 data = (compact + 1);
1009 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 else
1011 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001012 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1013 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001020 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001023 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1024 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001147 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
1149 This function assumes that unicode can hold one more code point than wstr
1150 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001151static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001153 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 const wchar_t *iter;
1156 Py_UCS4 *ucs4_out;
1157
Victor Stinner910337b2011-10-03 03:20:16 +02001158 assert(unicode != NULL);
1159 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163 for (iter = begin; iter < end; ) {
1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167 && (iter+1) < end
1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 {
Victor Stinner551ac952011-11-29 22:58:13 +01001170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 iter += 2;
1172 }
1173 else {
1174 *ucs4_out++ = *iter;
1175 iter++;
1176 }
1177 }
1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179 _PyUnicode_GET_LENGTH(unicode)));
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181}
1182#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001536 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1537 PyErr_NoMemory();
1538 return -1;
1539 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1541 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyErr_NoMemory();
1543 return -1;
1544 }
1545 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1546 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001547 _PyUnicode_UTF8(unicode) = NULL;
1548 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001549 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1550 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001551 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552 PyObject_FREE(_PyUnicode_WSTR(unicode));
1553 _PyUnicode_WSTR(unicode) = NULL;
1554 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1555#else
1556 assert(num_surrogates == 0);
1557
Victor Stinnerc3c74152011-10-02 20:39:55 +02001558 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001560 _PyUnicode_UTF8(unicode) = NULL;
1561 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1563#endif
1564 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1565 }
1566 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001567 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 return 0;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001572unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573{
Walter Dörwald16807132007-05-25 13:52:07 +00001574 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 case SSTATE_NOT_INTERNED:
1576 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 case SSTATE_INTERNED_MORTAL:
1579 /* revive dead object temporarily for DelItem */
1580 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001581 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 Py_FatalError(
1583 "deletion of interned string failed");
1584 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001585
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 case SSTATE_INTERNED_IMMORTAL:
1587 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 default:
1590 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001591 }
1592
Victor Stinner03490912011-10-03 23:45:12 +02001593 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001595 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001596 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1598 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601}
1602
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singleton
   objects -- the empty string, or the cached one-character string stored
   in unicode_latin1[] for its code point -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that already has a
       canonical kind can be a cached Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1619
Alexander Belopolsky40018472011-02-26 01:02:56 +00001620static int
Victor Stinner488fa492011-12-12 00:01:39 +01001621unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001622{
Victor Stinner488fa492011-12-12 00:01:39 +01001623 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (Py_REFCNT(unicode) != 1)
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (_PyUnicode_HASH(unicode) != -1)
1627 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_CHECK_INTERNED(unicode))
1629 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001630 if (!PyUnicode_CheckExact(unicode))
1631 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001632#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001633 /* singleton refcount is greater than 1 */
1634 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001635#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636 return 1;
1637}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001638
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639static int
1640unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1641{
1642 PyObject *unicode;
1643 Py_ssize_t old_length;
1644
1645 assert(p_unicode != NULL);
1646 unicode = *p_unicode;
1647
1648 assert(unicode != NULL);
1649 assert(PyUnicode_Check(unicode));
1650 assert(0 <= length);
1651
Victor Stinner910337b2011-10-03 03:20:16 +02001652 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001653 old_length = PyUnicode_WSTR_LENGTH(unicode);
1654 else
1655 old_length = PyUnicode_GET_LENGTH(unicode);
1656 if (old_length == length)
1657 return 0;
1658
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001660 _Py_INCREF_UNICODE_EMPTY();
1661 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 Py_DECREF(*p_unicode);
1664 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001665 return 0;
1666 }
1667
Victor Stinner488fa492011-12-12 00:01:39 +01001668 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001669 PyObject *copy = resize_copy(unicode, length);
1670 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001671 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 Py_DECREF(*p_unicode);
1673 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001675 }
1676
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001678 PyObject *new_unicode = resize_compact(unicode, length);
1679 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001681 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001683 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001684 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001685}
1686
Alexander Belopolsky40018472011-02-26 01:02:56 +00001687int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001689{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001690 PyObject *unicode;
1691 if (p_unicode == NULL) {
1692 PyErr_BadInternalCall();
1693 return -1;
1694 }
1695 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001696 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697 {
1698 PyErr_BadInternalCall();
1699 return -1;
1700 }
1701 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001702}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001703
Victor Stinnerc5166102012-02-22 13:55:02 +01001704/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001705
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001706 WARNING: The function doesn't copy the terminating null character and
1707 doesn't check the maximum character (may write a latin1 character in an
1708 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001709static void
1710unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1711 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001712{
1713 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1714 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001715 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001716
1717 switch (kind) {
1718 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001720#ifdef Py_DEBUG
1721 if (PyUnicode_IS_ASCII(unicode)) {
1722 Py_UCS4 maxchar = ucs1lib_find_max_char(
1723 (const Py_UCS1*)str,
1724 (const Py_UCS1*)str + len);
1725 assert(maxchar < 128);
1726 }
1727#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001728 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001729 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001730 }
1731 case PyUnicode_2BYTE_KIND: {
1732 Py_UCS2 *start = (Py_UCS2 *)data + index;
1733 Py_UCS2 *ucs2 = start;
1734 assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
Victor Stinner184252a2012-06-16 02:57:41 +02001736 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 *ucs2 = (Py_UCS2)*str;
1738
1739 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001740 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001741 }
1742 default: {
1743 Py_UCS4 *start = (Py_UCS4 *)data + index;
1744 Py_UCS4 *ucs4 = start;
1745 assert(kind == PyUnicode_4BYTE_KIND);
1746 assert(index <= PyUnicode_GET_LENGTH(unicode));
1747
Victor Stinner184252a2012-06-16 02:57:41 +02001748 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001749 *ucs4 = (Py_UCS4)*str;
1750
1751 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001752 }
1753 }
1754}
1755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756static PyObject*
1757get_latin1_char(unsigned char ch)
1758{
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 if (!unicode)
1763 return NULL;
1764 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001765 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 unicode_latin1[ch] = unicode;
1767 }
1768 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001769 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770}
1771
Victor Stinner985a82a2014-01-03 12:53:47 +01001772static PyObject*
1773unicode_char(Py_UCS4 ch)
1774{
1775 PyObject *unicode;
1776
1777 assert(ch <= MAX_UNICODE);
1778
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001779 if (ch < 256)
1780 return get_latin1_char(ch);
1781
Victor Stinner985a82a2014-01-03 12:53:47 +01001782 unicode = PyUnicode_New(1, ch);
1783 if (unicode == NULL)
1784 return NULL;
1785 switch (PyUnicode_KIND(unicode)) {
1786 case PyUnicode_1BYTE_KIND:
1787 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1788 break;
1789 case PyUnicode_2BYTE_KIND:
1790 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1791 break;
1792 default:
1793 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1794 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1795 }
1796 assert(_PyUnicode_CheckConsistency(unicode, 1));
1797 return unicode;
1798}
1799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001803 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 Py_UCS4 maxchar = 0;
1805 Py_ssize_t num_surrogates;
1806
1807 if (u == NULL)
1808 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810 /* If the Unicode data is known at construction time, we can apply
1811 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001814 if (size == 0)
1815 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 /* Single character Unicode objects in the Latin-1 range are
1818 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001819 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 return get_latin1_char((unsigned char)*u);
1821
1822 /* If not empty and not single character, copy the Unicode data
1823 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001824 if (find_maxchar_surrogates(u, u + size,
1825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return NULL;
1827
Victor Stinner8faf8212011-12-08 22:14:11 +01001828 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 if (!unicode)
1830 return NULL;
1831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 switch (PyUnicode_KIND(unicode)) {
1833 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1836 break;
1837 case PyUnicode_2BYTE_KIND:
1838#if Py_UNICODE_SIZE == 2
1839 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1840#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001841 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1843#endif
1844 break;
1845 case PyUnicode_4BYTE_KIND:
1846#if SIZEOF_WCHAR_T == 2
1847 /* This is the only case which has to process surrogates, thus
1848 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001849 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850#else
1851 assert(num_surrogates == 0);
1852 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1853#endif
1854 break;
1855 default:
1856 assert(0 && "Impossible state");
1857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001859 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860}
1861
Alexander Belopolsky40018472011-02-26 01:02:56 +00001862PyObject *
1863PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001864{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 if (size < 0) {
1866 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 return NULL;
1869 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001870 if (u != NULL)
1871 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1872 else
1873 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001874}
1875
Alexander Belopolsky40018472011-02-26 01:02:56 +00001876PyObject *
1877PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001878{
1879 size_t size = strlen(u);
1880 if (size > PY_SSIZE_T_MAX) {
1881 PyErr_SetString(PyExc_OverflowError, "input too long");
1882 return NULL;
1883 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001884 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001885}
1886
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001887PyObject *
1888_PyUnicode_FromId(_Py_Identifier *id)
1889{
1890 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001891 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1892 strlen(id->string),
1893 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001894 if (!id->object)
1895 return NULL;
1896 PyUnicode_InternInPlace(&id->object);
1897 assert(!id->next);
1898 id->next = static_strings;
1899 static_strings = id;
1900 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001901 return id->object;
1902}
1903
1904void
1905_PyUnicode_ClearStaticStrings()
1906{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001907 _Py_Identifier *tmp, *s = static_strings;
1908 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001909 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 tmp = s->next;
1911 s->next = NULL;
1912 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001914 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001915}
1916
Benjamin Peterson0df54292012-03-26 14:50:32 -04001917/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Victor Stinnerd3f08822012-05-29 12:57:52 +02001919PyObject*
1920_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001921{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001922 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001923 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001924 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001926 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001927#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001928 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001929 }
Victor Stinner785938e2011-12-11 20:09:03 +01001930 unicode = PyUnicode_New(size, 127);
1931 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001932 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001933 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1934 assert(_PyUnicode_CheckConsistency(unicode, 1));
1935 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001936}
1937
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938static Py_UCS4
1939kind_maxchar_limit(unsigned int kind)
1940{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001941 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001942 case PyUnicode_1BYTE_KIND:
1943 return 0x80;
1944 case PyUnicode_2BYTE_KIND:
1945 return 0x100;
1946 case PyUnicode_4BYTE_KIND:
1947 return 0x10000;
1948 default:
1949 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001950 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001951 }
1952}
1953
Victor Stinnere6abb482012-05-02 01:15:40 +02001954Py_LOCAL_INLINE(Py_UCS4)
1955align_maxchar(Py_UCS4 maxchar)
1956{
1957 if (maxchar <= 127)
1958 return 127;
1959 else if (maxchar <= 255)
1960 return 255;
1961 else if (maxchar <= 65535)
1962 return 65535;
1963 else
1964 return MAX_UNICODE;
1965}
1966
Victor Stinner702c7342011-10-05 13:50:52 +02001967static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001968_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001972
Serhiy Storchaka678db842013-01-26 12:16:36 +02001973 if (size == 0)
1974 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001976 if (size == 1)
1977 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
1983 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001986}
1987
Victor Stinnere57b1c02011-09-28 22:20:48 +02001988static PyObject*
1989_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990{
1991 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001993
Serhiy Storchaka678db842013-01-26 12:16:36 +02001994 if (size == 0)
1995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001996 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 if (size == 1)
1998 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002000 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (!res)
2003 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002006 else {
2007 _PyUnicode_CONVERT_BYTES(
2008 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2009 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002010 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 return res;
2012}
2013
Victor Stinnere57b1c02011-09-28 22:20:48 +02002014static PyObject*
2015_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016{
2017 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019
Serhiy Storchaka678db842013-01-26 12:16:36 +02002020 if (size == 0)
2021 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 if (size == 1)
2024 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002025
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002026 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002027 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (!res)
2029 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002030 if (max_char < 256)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2032 PyUnicode_1BYTE_DATA(res));
2033 else if (max_char < 0x10000)
2034 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2035 PyUnicode_2BYTE_DATA(res));
2036 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002038 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 return res;
2040}
2041
2042PyObject*
2043PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2044{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002045 if (size < 0) {
2046 PyErr_SetString(PyExc_ValueError, "size must be positive");
2047 return NULL;
2048 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002049 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002055 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002056 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002057 PyErr_SetString(PyExc_SystemError, "invalid kind");
2058 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060}
2061
Victor Stinnerece58de2012-04-23 23:36:38 +02002062Py_UCS4
2063_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2064{
2065 enum PyUnicode_Kind kind;
2066 void *startptr, *endptr;
2067
2068 assert(PyUnicode_IS_READY(unicode));
2069 assert(0 <= start);
2070 assert(end <= PyUnicode_GET_LENGTH(unicode));
2071 assert(start <= end);
2072
2073 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2074 return PyUnicode_MAX_CHAR_VALUE(unicode);
2075
2076 if (start == end)
2077 return 127;
2078
Victor Stinner94d558b2012-04-27 22:26:58 +02002079 if (PyUnicode_IS_ASCII(unicode))
2080 return 127;
2081
Victor Stinnerece58de2012-04-23 23:36:38 +02002082 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002083 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002084 endptr = (char *)startptr + end * kind;
2085 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002086 switch(kind) {
2087 case PyUnicode_1BYTE_KIND:
2088 return ucs1lib_find_max_char(startptr, endptr);
2089 case PyUnicode_2BYTE_KIND:
2090 return ucs2lib_find_max_char(startptr, endptr);
2091 case PyUnicode_4BYTE_KIND:
2092 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002094 assert(0);
2095 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002096 }
2097}
2098
Victor Stinner25a4b292011-10-06 12:31:55 +02002099/* Ensure that a string uses the most efficient storage, if it is not the
2100 case: create a new string with of the right kind. Write NULL into *p_unicode
2101 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002102static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002103unicode_adjust_maxchar(PyObject **p_unicode)
2104{
2105 PyObject *unicode, *copy;
2106 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002107 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002108 unsigned int kind;
2109
2110 assert(p_unicode != NULL);
2111 unicode = *p_unicode;
2112 assert(PyUnicode_IS_READY(unicode));
2113 if (PyUnicode_IS_ASCII(unicode))
2114 return;
2115
2116 len = PyUnicode_GET_LENGTH(unicode);
2117 kind = PyUnicode_KIND(unicode);
2118 if (kind == PyUnicode_1BYTE_KIND) {
2119 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002120 max_char = ucs1lib_find_max_char(u, u + len);
2121 if (max_char >= 128)
2122 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002123 }
2124 else if (kind == PyUnicode_2BYTE_KIND) {
2125 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs2lib_find_max_char(u, u + len);
2127 if (max_char >= 256)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
2130 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002133 max_char = ucs4lib_find_max_char(u, u + len);
2134 if (max_char >= 0x10000)
2135 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002138 if (copy != NULL)
2139 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002140 Py_DECREF(unicode);
2141 *p_unicode = copy;
2142}
2143
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002145_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146{
Victor Stinner87af4f22011-11-21 23:03:47 +01002147 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner034f6cf2011-09-30 02:26:44 +02002150 if (!PyUnicode_Check(unicode)) {
2151 PyErr_BadInternalCall();
2152 return NULL;
2153 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002154 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156
Victor Stinner87af4f22011-11-21 23:03:47 +01002157 length = PyUnicode_GET_LENGTH(unicode);
2158 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 if (!copy)
2160 return NULL;
2161 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2162
Victor Stinner87af4f22011-11-21 23:03:47 +01002163 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2164 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002165 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002166 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002167}
2168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170/* Widen Unicode objects to larger buffers. Don't write terminating null
2171 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172
2173void*
2174_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2175{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002176 Py_ssize_t len;
2177 void *result;
2178 unsigned int skind;
2179
Benjamin Petersonbac79492012-01-14 13:34:47 -05002180 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 return NULL;
2182
2183 len = PyUnicode_GET_LENGTH(s);
2184 skind = PyUnicode_KIND(s);
2185 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002189 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002190 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002191 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002192 if (!result)
2193 return PyErr_NoMemory();
2194 assert(skind == PyUnicode_1BYTE_KIND);
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS1, Py_UCS2,
2197 PyUnicode_1BYTE_DATA(s),
2198 PyUnicode_1BYTE_DATA(s) + len,
2199 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002201 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002202 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 if (!result)
2204 return PyErr_NoMemory();
2205 if (skind == PyUnicode_2BYTE_KIND) {
2206 _PyUnicode_CONVERT_BYTES(
2207 Py_UCS2, Py_UCS4,
2208 PyUnicode_2BYTE_DATA(s),
2209 PyUnicode_2BYTE_DATA(s) + len,
2210 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002212 else {
2213 assert(skind == PyUnicode_1BYTE_KIND);
2214 _PyUnicode_CONVERT_BYTES(
2215 Py_UCS1, Py_UCS4,
2216 PyUnicode_1BYTE_DATA(s),
2217 PyUnicode_1BYTE_DATA(s) + len,
2218 result);
2219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002221 default:
2222 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Victor Stinner01698042011-10-04 00:04:26 +02002224 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return NULL;
2226}
2227
2228static Py_UCS4*
2229as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230 int copy_null)
2231{
2232 int kind;
2233 void *data;
2234 Py_ssize_t len, targetlen;
2235 if (PyUnicode_READY(string) == -1)
2236 return NULL;
2237 kind = PyUnicode_KIND(string);
2238 data = PyUnicode_DATA(string);
2239 len = PyUnicode_GET_LENGTH(string);
2240 targetlen = len;
2241 if (copy_null)
2242 targetlen++;
2243 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002244 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
/* Size of the buffers used to format an integer (%lld and friends) or a
   pointer (%p).
   An integer needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   2 + (x - 1) / 22 is the integer-arithmetic form of ceil(x / 22) + 1. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002318
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002319static int
2320unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2321 Py_ssize_t width, Py_ssize_t precision)
2322{
2323 Py_ssize_t length, fill, arglen;
2324 Py_UCS4 maxchar;
2325
2326 if (PyUnicode_READY(str) == -1)
2327 return -1;
2328
2329 length = PyUnicode_GET_LENGTH(str);
2330 if ((precision == -1 || precision >= length)
2331 && width <= length)
2332 return _PyUnicodeWriter_WriteStr(writer, str);
2333
2334 if (precision != -1)
2335 length = Py_MIN(precision, length);
2336
2337 arglen = Py_MAX(length, width);
2338 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2339 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2340 else
2341 maxchar = writer->maxchar;
2342
2343 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2344 return -1;
2345
2346 if (width > length) {
2347 fill = width - length;
2348 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2349 return -1;
2350 writer->pos += fill;
2351 }
2352
2353 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2354 str, 0, length);
2355 writer->pos += length;
2356 return 0;
2357}
2358
2359static int
2360unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2361 Py_ssize_t width, Py_ssize_t precision)
2362{
2363 /* UTF-8 */
2364 Py_ssize_t length;
2365 PyObject *unicode;
2366 int res;
2367
2368 length = strlen(str);
2369 if (precision != -1)
2370 length = Py_MIN(length, precision);
2371 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2372 if (unicode == NULL)
2373 return -1;
2374
2375 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2376 Py_DECREF(unicode);
2377 return res;
2378}
2379
Victor Stinner96865452011-03-01 23:44:09 +00002380static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002381unicode_fromformat_arg(_PyUnicodeWriter *writer,
2382 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002383{
Victor Stinnere215d962012-10-06 23:03:36 +02002384 const char *p;
2385 Py_ssize_t len;
2386 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002387 Py_ssize_t width;
2388 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002389 int longflag;
2390 int longlongflag;
2391 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002392 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002393
2394 p = f;
2395 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002396 zeropad = 0;
2397 if (*f == '0') {
2398 zeropad = 1;
2399 f++;
2400 }
Victor Stinner96865452011-03-01 23:44:09 +00002401
2402 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002403 width = -1;
2404 if (Py_ISDIGIT((unsigned)*f)) {
2405 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002406 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002407 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002408 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002409 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002410 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002411 return NULL;
2412 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002414 f++;
2415 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 }
2417 precision = -1;
2418 if (*f == '.') {
2419 f++;
2420 if (Py_ISDIGIT((unsigned)*f)) {
2421 precision = (*f - '0');
2422 f++;
2423 while (Py_ISDIGIT((unsigned)*f)) {
2424 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2425 PyErr_SetString(PyExc_ValueError,
2426 "precision too big");
2427 return NULL;
2428 }
2429 precision = (precision * 10) + (*f - '0');
2430 f++;
2431 }
2432 }
Victor Stinner96865452011-03-01 23:44:09 +00002433 if (*f == '%') {
2434 /* "%.3%s" => f points to "3" */
2435 f--;
2436 }
2437 }
2438 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002439 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002440 f--;
2441 }
Victor Stinner96865452011-03-01 23:44:09 +00002442
2443 /* Handle %ld, %lu, %lld and %llu. */
2444 longflag = 0;
2445 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002446 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002447 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002448 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002449 longflag = 1;
2450 ++f;
2451 }
2452#ifdef HAVE_LONG_LONG
2453 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002454 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002455 longlongflag = 1;
2456 f += 2;
2457 }
2458#endif
2459 }
2460 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002461 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002462 size_tflag = 1;
2463 ++f;
2464 }
Victor Stinnere215d962012-10-06 23:03:36 +02002465
2466 if (f[1] == '\0')
2467 writer->overallocate = 0;
2468
2469 switch (*f) {
2470 case 'c':
2471 {
2472 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002473 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002474 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002475 "character argument not in range(0x110000)");
2476 return NULL;
2477 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002478 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002479 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002480 break;
2481 }
2482
2483 case 'i':
2484 case 'd':
2485 case 'u':
2486 case 'x':
2487 {
2488 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002489 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002490 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002493 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002494 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002495 va_arg(*vargs, unsigned long));
2496#ifdef HAVE_LONG_LONG
2497 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002498 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002499 va_arg(*vargs, unsigned PY_LONG_LONG));
2500#endif
2501 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002502 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002503 va_arg(*vargs, size_t));
2504 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002505 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002506 va_arg(*vargs, unsigned int));
2507 }
2508 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002509 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002510 }
2511 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002512 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002513 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002514 va_arg(*vargs, long));
2515#ifdef HAVE_LONG_LONG
2516 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002517 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002518 va_arg(*vargs, PY_LONG_LONG));
2519#endif
2520 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002521 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002522 va_arg(*vargs, Py_ssize_t));
2523 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002524 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002525 va_arg(*vargs, int));
2526 }
2527 assert(len >= 0);
2528
Victor Stinnere215d962012-10-06 23:03:36 +02002529 if (precision < len)
2530 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002531
2532 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2534 return NULL;
2535
Victor Stinnere215d962012-10-06 23:03:36 +02002536 if (width > precision) {
2537 Py_UCS4 fillchar;
2538 fill = width - precision;
2539 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2541 return NULL;
2542 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002543 }
Victor Stinner15a11362012-10-06 23:48:20 +02002544 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002545 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002546 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2547 return NULL;
2548 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002549 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002550
Victor Stinner4a587072013-11-19 12:54:53 +01002551 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2552 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002553 break;
2554 }
2555
2556 case 'p':
2557 {
2558 char number[MAX_LONG_LONG_CHARS];
2559
2560 len = sprintf(number, "%p", va_arg(*vargs, void*));
2561 assert(len >= 0);
2562
2563 /* %p is ill-defined: ensure leading 0x. */
2564 if (number[1] == 'X')
2565 number[1] = 'x';
2566 else if (number[1] != 'x') {
2567 memmove(number + 2, number,
2568 strlen(number) + 1);
2569 number[0] = '0';
2570 number[1] = 'x';
2571 len += 2;
2572 }
2573
Victor Stinner4a587072013-11-19 12:54:53 +01002574 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002575 return NULL;
2576 break;
2577 }
2578
2579 case 's':
2580 {
2581 /* UTF-8 */
2582 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002584 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002585 break;
2586 }
2587
2588 case 'U':
2589 {
2590 PyObject *obj = va_arg(*vargs, PyObject *);
2591 assert(obj && _PyUnicode_CHECK(obj));
2592
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002594 return NULL;
2595 break;
2596 }
2597
2598 case 'V':
2599 {
2600 PyObject *obj = va_arg(*vargs, PyObject *);
2601 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002602 if (obj) {
2603 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002605 return NULL;
2606 }
2607 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 assert(str != NULL);
2609 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002611 }
2612 break;
2613 }
2614
2615 case 'S':
2616 {
2617 PyObject *obj = va_arg(*vargs, PyObject *);
2618 PyObject *str;
2619 assert(obj);
2620 str = PyObject_Str(obj);
2621 if (!str)
2622 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002623 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002624 Py_DECREF(str);
2625 return NULL;
2626 }
2627 Py_DECREF(str);
2628 break;
2629 }
2630
2631 case 'R':
2632 {
2633 PyObject *obj = va_arg(*vargs, PyObject *);
2634 PyObject *repr;
2635 assert(obj);
2636 repr = PyObject_Repr(obj);
2637 if (!repr)
2638 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002640 Py_DECREF(repr);
2641 return NULL;
2642 }
2643 Py_DECREF(repr);
2644 break;
2645 }
2646
2647 case 'A':
2648 {
2649 PyObject *obj = va_arg(*vargs, PyObject *);
2650 PyObject *ascii;
2651 assert(obj);
2652 ascii = PyObject_ASCII(obj);
2653 if (!ascii)
2654 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002655 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002656 Py_DECREF(ascii);
2657 return NULL;
2658 }
2659 Py_DECREF(ascii);
2660 break;
2661 }
2662
2663 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667
2668 default:
2669 /* if we stumble upon an unknown formatting code, copy the rest
2670 of the format string to the output string. (we cannot just
2671 skip the code, since there's no way to know what's in the
2672 argument list) */
2673 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002674 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002675 return NULL;
2676 f = p+len;
2677 return f;
2678 }
2679
2680 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002681 return f;
2682}
2683
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684PyObject *
2685PyUnicode_FromFormatV(const char *format, va_list vargs)
2686{
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_list vargs2;
2688 const char *f;
2689 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002690
Victor Stinner8f674cc2013-04-17 23:02:17 +02002691 _PyUnicodeWriter_Init(&writer);
2692 writer.min_length = strlen(format) + 100;
2693 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2696 Copy it to be able to pass a reference to a subfunction. */
2697 Py_VA_COPY(vargs2, vargs);
2698
2699 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002701 f = unicode_fromformat_arg(&writer, f, &vargs2);
2702 if (f == NULL)
2703 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002706 const char *p;
2707 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 p = f;
2710 do
2711 {
2712 if ((unsigned char)*p > 127) {
2713 PyErr_Format(PyExc_ValueError,
2714 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2715 "string, got a non-ASCII byte: 0x%02x",
2716 (unsigned char)*p);
2717 return NULL;
2718 }
2719 p++;
2720 }
2721 while (*p != '\0' && *p != '%');
2722 len = p - f;
2723
2724 if (*p == '\0')
2725 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002726
2727 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002728 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002732 }
Victor Stinnere215d962012-10-06 23:03:36 +02002733 return _PyUnicodeWriter_Finish(&writer);
2734
2735 fail:
2736 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002738}
2739
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740PyObject *
2741PyUnicode_FromFormat(const char *format, ...)
2742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 PyObject* ret;
2744 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745
2746#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002748#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 ret = PyUnicode_FromFormatV(format, vargs);
2752 va_end(vargs);
2753 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002754}
2755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756#ifdef HAVE_WCHAR_H
2757
Victor Stinner5593d8a2010-10-02 11:11:27 +00002758/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2759 convert a Unicode object to a wide character string.
2760
Victor Stinnerd88d9832011-09-06 02:00:05 +02002761 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002762 character) required to convert the unicode object. Ignore size argument.
2763
Victor Stinnerd88d9832011-09-06 02:00:05 +02002764 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002765 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002768unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 wchar_t *w,
2770 Py_ssize_t size)
2771{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002772 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 const wchar_t *wstr;
2774
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002775 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 if (wstr == NULL)
2777 return -1;
2778
Victor Stinner5593d8a2010-10-02 11:11:27 +00002779 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (size > res)
2781 size = res + 1;
2782 else
2783 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 return res;
2786 }
2787 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002789}
2790
2791Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002792PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002793 wchar_t *w,
2794 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795{
2796 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 PyErr_BadInternalCall();
2798 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002800 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
Victor Stinner137c34c2010-09-29 10:25:54 +00002803wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002804PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002805 Py_ssize_t *size)
2806{
2807 wchar_t* buffer;
2808 Py_ssize_t buflen;
2809
2810 if (unicode == NULL) {
2811 PyErr_BadInternalCall();
2812 return NULL;
2813 }
2814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002815 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 if (buflen == -1)
2817 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002818 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002819 if (buffer == NULL) {
2820 PyErr_NoMemory();
2821 return NULL;
2822 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002823 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002824 if (buflen == -1) {
2825 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002827 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 if (size != NULL)
2829 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 return buffer;
2831}
2832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
Alexander Belopolsky40018472011-02-26 01:02:56 +00002835PyObject *
2836PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002837{
Victor Stinner8faf8212011-12-08 22:14:11 +01002838 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyErr_SetString(PyExc_ValueError,
2840 "chr() arg not in range(0x110000)");
2841 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002843
Victor Stinner985a82a2014-01-03 12:53:47 +01002844 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002845}
2846
Alexander Belopolsky40018472011-02-26 01:02:56 +00002847PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002848PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002850 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002853 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002854 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 Py_INCREF(obj);
2856 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 }
2858 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 /* For a Unicode subtype that's not a Unicode object,
2860 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002861 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002862 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002863 PyErr_Format(PyExc_TypeError,
2864 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002865 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002867}
2868
Alexander Belopolsky40018472011-02-26 01:02:56 +00002869PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002870PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002871 const char *encoding,
2872 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002873{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002875 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002876
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 PyErr_BadInternalCall();
2879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002881
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002882 /* Decoding bytes objects is the most common case and should be fast */
2883 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002884 if (PyBytes_GET_SIZE(obj) == 0)
2885 _Py_RETURN_UNICODE_EMPTY();
2886 v = PyUnicode_Decode(
2887 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2888 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 return v;
2890 }
2891
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyErr_SetString(PyExc_TypeError,
2894 "decoding str is not supported");
2895 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002898 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2899 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2900 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002901 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002902 Py_TYPE(obj)->tp_name);
2903 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002904 }
Tim Petersced69f82003-09-16 20:30:58 +00002905
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002906 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002907 PyBuffer_Release(&buffer);
2908 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002910
Serhiy Storchaka05997252013-01-26 12:14:02 +02002911 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002912 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914}
2915
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8. A NULL encoding is normalized to "utf-8".

   lower is the output buffer and lower_len its capacity in bytes,
   including room for the terminating null byte.

   Return 1 on success, 0 on error (the normalized name, terminator
   included, does not fit into lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator. Bail out before
       computing &lower[lower_len - 1]: the unsigned subtraction would wrap
       around and the loop below would write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 Py_ssize_t size,
2959 const char *encoding,
2960 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002961{
2962 PyObject *buffer = NULL, *unicode;
2963 Py_buffer info;
2964 char lower[11]; /* Enough for any encoding shortcut */
2965
Fred Drakee4315f52000-05-09 19:53:39 +00002966 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002967 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002968 if ((strcmp(lower, "utf-8") == 0) ||
2969 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002970 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002971 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002972 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002973 (strcmp(lower, "iso-8859-1") == 0) ||
2974 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002975 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002976#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002977 else if (strcmp(lower, "mbcs") == 0)
2978 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002979#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if (strcmp(lower, "ascii") == 0)
2981 return PyUnicode_DecodeASCII(s, size, errors);
2982 else if (strcmp(lower, "utf-16") == 0)
2983 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2984 else if (strcmp(lower, "utf-32") == 0)
2985 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987
2988 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002989 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002990 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002991 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002992 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 if (buffer == NULL)
2994 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002995 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (unicode == NULL)
2997 goto onError;
2998 if (!PyUnicode_Check(unicode)) {
2999 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003000 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3001 "use codecs.decode() to decode to arbitrary types",
3002 encoding,
3003 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 Py_DECREF(unicode);
3005 goto onError;
3006 }
3007 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_XDECREF(buffer);
3012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003034 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 const char *encoding,
3043 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044{
3045 PyObject *v;
3046
3047 if (!PyUnicode_Check(unicode)) {
3048 PyErr_BadArgument();
3049 goto onError;
3050 }
3051
3052 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003054
3055 /* Decode via the codec registry */
3056 v = PyCodec_Decode(unicode, encoding, errors);
3057 if (v == NULL)
3058 goto onError;
3059 if (!PyUnicode_Check(v)) {
3060 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003061 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3062 "use codecs.decode() to decode to arbitrary types",
3063 encoding,
3064 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003065 Py_DECREF(v);
3066 goto onError;
3067 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003068 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003071 return NULL;
3072}
3073
Alexander Belopolsky40018472011-02-26 01:02:56 +00003074PyObject *
3075PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003076 Py_ssize_t size,
3077 const char *encoding,
3078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079{
3080 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 unicode = PyUnicode_FromUnicode(s, size);
3083 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3086 Py_DECREF(unicode);
3087 return v;
3088}
3089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090PyObject *
3091PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003092 const char *encoding,
3093 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094{
3095 PyObject *v;
3096
3097 if (!PyUnicode_Check(unicode)) {
3098 PyErr_BadArgument();
3099 goto onError;
3100 }
3101
3102 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003104
3105 /* Encode via the codec registry */
3106 v = PyCodec_Encode(unicode, encoding, errors);
3107 if (v == NULL)
3108 goto onError;
3109 return v;
3110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003112 return NULL;
3113}
3114
/* Locate the first wide character of the null-terminated string wstr that
   wcstombs() refuses to convert. The string is fed to wcstombs() one
   character at a time (one surrogate pair at a time when wchar_t is 16 bits
   wide). Return the index of the offending character, or 0 if none could be
   found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    /* Room for a surrogate pair plus the terminator. */
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot is never overwritten: it always terminates chunk. */
    chunk[sizeof(chunk) / sizeof(chunk[0]) - 1] = 0;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;
        size_t nbytes;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor++;
            chunk[1] = 0;
        }
#else
        chunk[0] = *cursor++;
#endif
        nbytes = wcstombs(scratch, chunk, sizeof(scratch));
        if (nbytes == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3161
Victor Stinner1b579672011-12-17 05:47:23 +01003162static int
3163locale_error_handler(const char *errors, int *surrogateescape)
3164{
3165 if (errors == NULL) {
3166 *surrogateescape = 0;
3167 return 0;
3168 }
3169
3170 if (strcmp(errors, "strict") == 0) {
3171 *surrogateescape = 0;
3172 return 0;
3173 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003174 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003175 *surrogateescape = 1;
3176 return 0;
3177 }
3178 PyErr_Format(PyExc_ValueError,
3179 "only 'strict' and 'surrogateescape' error handlers "
3180 "are supported, not '%s'",
3181 errors);
3182 return -1;
3183}
3184
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003185PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003186PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003187{
3188 Py_ssize_t wlen, wlen2;
3189 wchar_t *wstr;
3190 PyObject *bytes = NULL;
3191 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003192 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003193 PyObject *exc;
3194 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003195 int surrogateescape;
3196
3197 if (locale_error_handler(errors, &surrogateescape) < 0)
3198 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003199
3200 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3201 if (wstr == NULL)
3202 return NULL;
3203
3204 wlen2 = wcslen(wstr);
3205 if (wlen2 != wlen) {
3206 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003207 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208 return NULL;
3209 }
3210
3211 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003212 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213 char *str;
3214
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003215 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 if (str == NULL) {
3217 if (error_pos == (size_t)-1) {
3218 PyErr_NoMemory();
3219 PyMem_Free(wstr);
3220 return NULL;
3221 }
3222 else {
3223 goto encode_error;
3224 }
3225 }
3226 PyMem_Free(wstr);
3227
3228 bytes = PyBytes_FromString(str);
3229 PyMem_Free(str);
3230 }
3231 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003232 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233 size_t len, len2;
3234
3235 len = wcstombs(NULL, wstr, 0);
3236 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003237 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 goto encode_error;
3239 }
3240
3241 bytes = PyBytes_FromStringAndSize(NULL, len);
3242 if (bytes == NULL) {
3243 PyMem_Free(wstr);
3244 return NULL;
3245 }
3246
3247 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3248 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003249 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003250 goto encode_error;
3251 }
3252 PyMem_Free(wstr);
3253 }
3254 return bytes;
3255
3256encode_error:
3257 errmsg = strerror(errno);
3258 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003259
3260 if (error_pos == (size_t)-1)
3261 error_pos = wcstombs_errorpos(wstr);
3262
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 PyMem_Free(wstr);
3264 Py_XDECREF(bytes);
3265
Victor Stinner2f197072011-12-17 07:08:30 +01003266 if (errmsg != NULL) {
3267 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003268 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003269 if (wstr != NULL) {
3270 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003271 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003272 } else
3273 errmsg = NULL;
3274 }
3275 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003276 reason = PyUnicode_FromString(
3277 "wcstombs() encountered an unencodable "
3278 "wide character");
3279 if (reason == NULL)
3280 return NULL;
3281
3282 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3283 "locale", unicode,
3284 (Py_ssize_t)error_pos,
3285 (Py_ssize_t)(error_pos+1),
3286 reason);
3287 Py_DECREF(reason);
3288 if (exc != NULL) {
3289 PyCodec_StrictErrors(exc);
3290 Py_XDECREF(exc);
3291 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003292 return NULL;
3293}
3294
Victor Stinnerad158722010-10-27 00:25:46 +00003295PyObject *
3296PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003297{
Victor Stinner99b95382011-07-04 14:23:54 +02003298#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003299 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003300#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003302#else
Victor Stinner793b5312011-04-27 00:24:21 +02003303 PyInterpreterState *interp = PyThreadState_GET()->interp;
3304 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3305 cannot use it to encode and decode filenames before it is loaded. Load
3306 the Python codec requires to encode at least its own filename. Use the C
3307 version of the locale codec until the codec registry is initialized and
3308 the Python codec is loaded.
3309
3310 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3311 cannot only rely on it: check also interp->fscodec_initialized for
3312 subinterpreters. */
3313 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003314 return PyUnicode_AsEncodedString(unicode,
3315 Py_FileSystemDefaultEncoding,
3316 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003317 }
3318 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003319 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003320 }
Victor Stinnerad158722010-10-27 00:25:46 +00003321#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003322}
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
3325PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 const char *encoding,
3327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328{
3329 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003330 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (!PyUnicode_Check(unicode)) {
3333 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
Fred Drakee4315f52000-05-09 19:53:39 +00003336
Fred Drakee4315f52000-05-09 19:53:39 +00003337 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003338 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003339 if ((strcmp(lower, "utf-8") == 0) ||
3340 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003341 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003342 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003344 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003346 }
Victor Stinner37296e82010-06-10 13:36:23 +00003347 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003348 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003349 (strcmp(lower, "iso-8859-1") == 0) ||
3350 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003352#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003353 else if (strcmp(lower, "mbcs") == 0)
3354 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003355#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003356 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003361 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003363 return NULL;
3364
3365 /* The normal path */
3366 if (PyBytes_Check(v))
3367 return v;
3368
3369 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003371 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003372 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003373
3374 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003375 "encoder %s returned bytearray instead of bytes; "
3376 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003377 encoding);
3378 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003379 Py_DECREF(v);
3380 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003382
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003383 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3384 Py_DECREF(v);
3385 return b;
3386 }
3387
3388 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003389 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3390 "use codecs.encode() to encode to arbitrary types",
3391 encoding,
3392 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003393 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003394 return NULL;
3395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
3409 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411
3412 /* Encode via the codec registry */
3413 v = PyCodec_Encode(unicode, encoding, errors);
3414 if (v == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(v)) {
3417 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003418 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3419 "use codecs.encode() to encode to arbitrary types",
3420 encoding,
3421 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 Py_DECREF(v);
3423 goto onError;
3424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003426
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 return NULL;
3429}
3430
/* Locate the first byte sequence in str (len bytes long) that mbrtowc()
   cannot decode, either because it is invalid or because it is incomplete.
   Return its byte offset, or 0 if no such sequence is found or if mbrtowc()
   is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, p, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return p - str;
        }
        p += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3461
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003462PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003463PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003464 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003465{
3466 wchar_t smallbuf[256];
3467 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3468 wchar_t *wstr;
3469 size_t wlen, wlen2;
3470 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003471 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003472 size_t error_pos;
3473 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003474 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3475 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003476
3477 if (locale_error_handler(errors, &surrogateescape) < 0)
3478 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003479
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003480 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3481 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482 return NULL;
3483 }
3484
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003485 if (surrogateescape) {
3486 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003487 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 if (wstr == NULL) {
3489 if (wlen == (size_t)-1)
3490 PyErr_NoMemory();
3491 else
3492 PyErr_SetFromErrno(PyExc_OSError);
3493 return NULL;
3494 }
3495
3496 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003497 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003498 }
3499 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501#ifndef HAVE_BROKEN_MBSTOWCS
3502 wlen = mbstowcs(NULL, str, 0);
3503#else
3504 wlen = len;
3505#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003506 if (wlen == (size_t)-1)
3507 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508 if (wlen+1 <= smallbuf_len) {
3509 wstr = smallbuf;
3510 }
3511 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003512 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003513 if (!wstr)
3514 return PyErr_NoMemory();
3515 }
3516
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003517 wlen2 = mbstowcs(wstr, str, wlen+1);
3518 if (wlen2 == (size_t)-1) {
3519 if (wstr != smallbuf)
3520 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003521 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 }
3523#ifdef HAVE_BROKEN_MBSTOWCS
3524 assert(wlen2 == wlen);
3525#endif
3526 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3527 if (wstr != smallbuf)
3528 PyMem_Free(wstr);
3529 }
3530 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003531
3532decode_error:
3533 errmsg = strerror(errno);
3534 assert(errmsg != NULL);
3535
3536 error_pos = mbstowcs_errorpos(str, len);
3537 if (errmsg != NULL) {
3538 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003539 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003540 if (wstr != NULL) {
3541 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003542 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003543 } else
3544 errmsg = NULL;
3545 }
3546 if (errmsg == NULL)
3547 reason = PyUnicode_FromString(
3548 "mbstowcs() encountered an invalid multibyte sequence");
3549 if (reason == NULL)
3550 return NULL;
3551
3552 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3553 "locale", str, len,
3554 (Py_ssize_t)error_pos,
3555 (Py_ssize_t)(error_pos+1),
3556 reason);
3557 Py_DECREF(reason);
3558 if (exc != NULL) {
3559 PyCodec_StrictErrors(exc);
3560 Py_XDECREF(exc);
3561 }
3562 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003563}
3564
3565PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003566PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003567{
3568 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003569 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003570}
3571
3572
3573PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003574PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003576 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3577}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578
Christian Heimes5894ba72007-11-04 11:43:14 +00003579PyObject*
3580PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3581{
Victor Stinner99b95382011-07-04 14:23:54 +02003582#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003583 return PyUnicode_DecodeMBCS(s, size, NULL);
3584#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003585 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003586#else
Victor Stinner793b5312011-04-27 00:24:21 +02003587 PyInterpreterState *interp = PyThreadState_GET()->interp;
3588 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3589 cannot use it to encode and decode filenames before it is loaded. Load
3590 the Python codec requires to encode at least its own filename. Use the C
3591 version of the locale codec until the codec registry is initialized and
3592 the Python codec is loaded.
3593
3594 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3595 cannot only rely on it: check also interp->fscodec_initialized for
3596 subinterpreters. */
3597 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003598 return PyUnicode_Decode(s, size,
3599 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003600 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601 }
3602 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003603 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003604 }
Victor Stinnerad158722010-10-27 00:25:46 +00003605#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003606}
3607
Martin v. Löwis011e8422009-05-05 04:43:17 +00003608
3609int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003610_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003611{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003612 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003613
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003614 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003615 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003616 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3617 PyUnicode_GET_LENGTH(str), '\0', 1);
3618 if (pos == -1)
3619 return 0;
3620 else
3621 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003622}
3623
Antoine Pitrou13348842012-01-29 18:36:34 +01003624int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003625PyUnicode_FSConverter(PyObject* arg, void* addr)
3626{
3627 PyObject *output = NULL;
3628 Py_ssize_t size;
3629 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003630 if (arg == NULL) {
3631 Py_DECREF(*(PyObject**)addr);
3632 return 1;
3633 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003634 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003635 output = arg;
3636 Py_INCREF(output);
3637 }
3638 else {
3639 arg = PyUnicode_FromObject(arg);
3640 if (!arg)
3641 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003642 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003643 Py_DECREF(arg);
3644 if (!output)
3645 return 0;
3646 if (!PyBytes_Check(output)) {
3647 Py_DECREF(output);
3648 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3649 return 0;
3650 }
3651 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003652 size = PyBytes_GET_SIZE(output);
3653 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003654 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003655 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003656 Py_DECREF(output);
3657 return 0;
3658 }
3659 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003660 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003661}
3662
3663
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003664int
3665PyUnicode_FSDecoder(PyObject* arg, void* addr)
3666{
3667 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003668 if (arg == NULL) {
3669 Py_DECREF(*(PyObject**)addr);
3670 return 1;
3671 }
3672 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003673 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003674 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003675 output = arg;
3676 Py_INCREF(output);
3677 }
3678 else {
3679 arg = PyBytes_FromObject(arg);
3680 if (!arg)
3681 return 0;
3682 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3683 PyBytes_GET_SIZE(arg));
3684 Py_DECREF(arg);
3685 if (!output)
3686 return 0;
3687 if (!PyUnicode_Check(output)) {
3688 Py_DECREF(output);
3689 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3690 return 0;
3691 }
3692 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003693 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003694 Py_DECREF(output);
3695 return 0;
3696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003697 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003698 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003699 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003700 Py_DECREF(output);
3701 return 0;
3702 }
3703 *(PyObject**)addr = output;
3704 return Py_CLEANUP_SUPPORTED;
3705}
3706
3707
Martin v. Löwis5b222132007-06-10 09:51:05 +00003708char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003710{
Christian Heimesf3863112007-11-22 07:46:41 +00003711 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003712
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003713 if (!PyUnicode_Check(unicode)) {
3714 PyErr_BadArgument();
3715 return NULL;
3716 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003717 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003718 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003720 if (PyUnicode_UTF8(unicode) == NULL) {
3721 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3723 if (bytes == NULL)
3724 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003725 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3726 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003727 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 Py_DECREF(bytes);
3729 return NULL;
3730 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003731 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3732 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3733 PyBytes_AS_STRING(bytes),
3734 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 Py_DECREF(bytes);
3736 }
3737
3738 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003739 *psize = PyUnicode_UTF8_LENGTH(unicode);
3740 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003741}
3742
3743char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003746 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3747}
3748
/* Return the wchar_t buffer stored in the object's wstr slot, building it
   on first use, and store its length in *size when size is not NULL.

   Returns NULL after PyErr_BadArgument() if `unicode` is not a str, and
   NULL after PyErr_NoMemory() if the buffer cannot be allocated (or its
   size would not fit).  The buffer is allocated with PyObject_MALLOC() and
   left attached to the object; it is written with a trailing 0 element.

   The conversion depends on the character kind and on SIZEOF_WCHAR_T:
     - 4-byte kind, 2-byte wchar_t: code points above 0xFFFF are written as
       a high/low surrogate pair, so the buffer is sized accordingly;
     - 4-byte kind, 4-byte wchar_t and 2-byte kind, 2-byte wchar_t: treated
       as impossible states (Py_FatalError);
     - 1-byte kind, and 2-byte kind with 4-byte wchar_t: each character is
       widened into one wchar_t. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749Py_UNICODE *
3750PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3751{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 const unsigned char *one_byte;
3753#if SIZEOF_WCHAR_T == 4
3754 const Py_UCS2 *two_bytes;
3755#else
3756 const Py_UCS4 *four_bytes;
3757 const Py_UCS4 *ucs4_end;
3758 Py_ssize_t num_surrogates;
3759#endif
3760 wchar_t *w;
3761 wchar_t *wchar_end;
3762
3763 if (!PyUnicode_Check(unicode)) {
3764 PyErr_BadArgument();
3765 return NULL;
3766 }
/* Only build the buffer when the wstr slot is still empty; otherwise the
   existing buffer is returned as is. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003769 assert(_PyUnicode_KIND(unicode) != 0);
3770 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773#if SIZEOF_WCHAR_T == 2
/* First pass: count the code points that need two wchar_t units. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3775 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 num_surrogates = 0;
3777
3778 for (; four_bytes < ucs4_end; ++four_bytes) {
3779 if (*four_bytes > 0xFFFF)
3780 ++num_surrogates;
3781 }
3782
/* One unit per character, one extra per surrogate pair, plus the
   terminating 0. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003783 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3784 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3785 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003786 PyErr_NoMemory();
3787 return NULL;
3788 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003789 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790
/* Second pass: restart from the beginning of the data and fill the
   buffer. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 w = _PyUnicode_WSTR(unicode);
3792 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3793 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3795 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003796 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003798 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3799 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 }
3801 else
3802 *w = *four_bytes;
3803
/* Debug-only sanity check that the write position stayed inside the
   buffer sized by the first pass. */
3804 if (w > wchar_end) {
3805 assert(0 && "Miscalculated string end");
3806 }
3807 }
3808 *w = 0;
3809#else
3810 /* sizeof(wchar_t) == 4 */
3811 Py_FatalError("Impossible unicode object state, wstr and str "
3812 "should share memory already.");
3813 return NULL;
3814#endif
3815 }
3816 else {
/* Refuse lengths for which sizeof(wchar_t) * (length + 1) would exceed
   PY_SSIZE_T_MAX. */
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003817 if ((size_t)_PyUnicode_LENGTH(unicode) >
3818 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3819 PyErr_NoMemory();
3820 return NULL;
3821 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003822 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3823 (_PyUnicode_LENGTH(unicode) + 1));
3824 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 PyErr_NoMemory();
3826 return NULL;
3827 }
/* NOTE(review): the wstr length is not written for compact ASCII objects;
   presumably they have no separate slot for it -- confirm against the
   object layout. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003828 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3829 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3830 w = _PyUnicode_WSTR(unicode);
3831 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3834 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 for (; w < wchar_end; ++one_byte, ++w)
3836 *w = *one_byte;
3837 /* null-terminate the wstr */
3838 *w = 0;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 for (; w < wchar_end; ++two_bytes, ++w)
3844 *w = *two_bytes;
3845 /* null-terminate the wstr */
3846 *w = 0;
3847#else
3848 /* sizeof(wchar_t) == 2 */
/* Release the buffer allocated just above before reporting the fatal
   state. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003849 PyObject_FREE(_PyUnicode_WSTR(unicode));
3850 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003851 Py_FatalError("Impossible unicode object state, wstr "
3852 "and str should share memory already.");
3853 return NULL;
3854#endif
3855 }
3856 else {
3857 assert(0 && "This should never happen.");
3858 }
3859 }
3860 }
3861 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003862 *size = PyUnicode_WSTR_LENGTH(unicode);
3863 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003864}
3865
Alexander Belopolsky40018472011-02-26 01:02:56 +00003866Py_UNICODE *
3867PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870}
3871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872
Alexander Belopolsky40018472011-02-26 01:02:56 +00003873Py_ssize_t
3874PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875{
3876 if (!PyUnicode_Check(unicode)) {
3877 PyErr_BadArgument();
3878 goto onError;
3879 }
3880 return PyUnicode_GET_SIZE(unicode);
3881
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 return -1;
3884}
3885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886Py_ssize_t
3887PyUnicode_GetLength(PyObject *unicode)
3888{
Victor Stinner07621332012-06-16 04:53:46 +02003889 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 PyErr_BadArgument();
3891 return -1;
3892 }
Victor Stinner07621332012-06-16 04:53:46 +02003893 if (PyUnicode_READY(unicode) == -1)
3894 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 return PyUnicode_GET_LENGTH(unicode);
3896}
3897
3898Py_UCS4
3899PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3900{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003901 void *data;
3902 int kind;
3903
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003904 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3905 PyErr_BadArgument();
3906 return (Py_UCS4)-1;
3907 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003908 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003909 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910 return (Py_UCS4)-1;
3911 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003912 data = PyUnicode_DATA(unicode);
3913 kind = PyUnicode_KIND(unicode);
3914 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915}
3916
3917int
3918PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3919{
3920 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003921 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 return -1;
3923 }
Victor Stinner488fa492011-12-12 00:01:39 +01003924 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003925 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003926 PyErr_SetString(PyExc_IndexError, "string index out of range");
3927 return -1;
3928 }
Victor Stinner488fa492011-12-12 00:01:39 +01003929 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003930 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003931 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3932 PyErr_SetString(PyExc_ValueError, "character out of range");
3933 return -1;
3934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3936 index, ch);
3937 return 0;
3938}
3939
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3945
Victor Stinner554f3f02010-06-16 23:33:54 +00003946/* create or adjust a UnicodeDecodeError */
3947static void
3948make_decode_exception(PyObject **exceptionObject,
3949 const char *encoding,
3950 const char *input, Py_ssize_t length,
3951 Py_ssize_t startpos, Py_ssize_t endpos,
3952 const char *reason)
3953{
3954 if (*exceptionObject == NULL) {
3955 *exceptionObject = PyUnicodeDecodeError_Create(
3956 encoding, input, length, startpos, endpos, reason);
3957 }
3958 else {
3959 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3960 goto onError;
3961 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3962 goto onError;
3963 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3964 goto onError;
3965 }
3966 return;
3967
3968onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003969 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003970}
3971
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler: handler name and cached handler object; the
       handler is looked up with PyCodec_LookupError() on first use.
   encoding, reason, startinpos, endinpos, exceptionObject: used to create
       or update the UnicodeDecodeError passed to the handler.
   input, inend, inptr: updated from the exception's object attribute after
       the call, since the handler may have replaced it; *inptr and
       *endinpos are set to the position returned by the handler.
   output, outpos: wchar_t-kind result string and write position; the
       string is resized if the replacement plus the remaining input would
       not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix and leaves only the
           human-readable message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the bytes accessors below must not be applied to a
           non-bytes object, and the error just set must be reported. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow at least geometrically when doubling cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4087
4088static int
4089unicode_decode_call_errorhandler_writer(
4090 const char *errors, PyObject **errorHandler,
4091 const char *encoding, const char *reason,
4092 const char **input, const char **inend, Py_ssize_t *startinpos,
4093 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4094 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4095{
4096 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4097
4098 PyObject *restuple = NULL;
4099 PyObject *repunicode = NULL;
4100 Py_ssize_t insize;
4101 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004102 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004103 PyObject *inputobj = NULL;
4104
4105 if (*errorHandler == NULL) {
4106 *errorHandler = PyCodec_LookupError(errors);
4107 if (*errorHandler == NULL)
4108 goto onError;
4109 }
4110
4111 make_decode_exception(exceptionObject,
4112 encoding,
4113 *input, *inend - *input,
4114 *startinpos, *endinpos,
4115 reason);
4116 if (*exceptionObject == NULL)
4117 goto onError;
4118
4119 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4120 if (restuple == NULL)
4121 goto onError;
4122 if (!PyTuple_Check(restuple)) {
4123 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4124 goto onError;
4125 }
4126 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004127 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004128
4129 /* Copy back the bytes variables, which might have been modified by the
4130 callback */
4131 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4132 if (!inputobj)
4133 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004134 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004136 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004137 *input = PyBytes_AS_STRING(inputobj);
4138 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004139 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004140 /* we can DECREF safely, as the exception has another reference,
4141 so the object won't go away. */
4142 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004146 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004147 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150
Victor Stinner8f674cc2013-04-17 23:02:17 +02004151 if (PyUnicode_READY(repunicode) < 0)
4152 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004153 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004154 if (replen > 1) {
4155 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004156 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004157 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4158 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4159 goto onError;
4160 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004161 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004162 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004165 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004166
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004168 Py_XDECREF(restuple);
4169 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004173 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174}
4175
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004176/* --- UTF-7 Codec -------------------------------------------------------- */
4177
Antoine Pitrou244651a2009-05-04 18:56:13 +00004178/* See RFC2152 for details. We encode conservatively and decode liberally. */
4179
/* Three simple macros defining base-64.
   Each of them may evaluate its argument more than once. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4202
/* DECODE_DIRECT: should this byte, found in a UTF-7 string, be decoded as
 * itself?  Decoding is permissive: every ASCII byte decodes to itself
 * except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                \
    (!((c) > 127 || (c) == '+'))
4210
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used to index
 * utf7_category.  directO and directWS are parenthesized so that
 * compound expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))

Alexander Belopolsky40018472011-02-26 01:02:56 +00004257PyObject *
4258PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004259 Py_ssize_t size,
4260 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004261{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004262 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4263}
4264
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265/* The decoder. The only state we preserve is our read position,
4266 * i.e. how many characters we have consumed. So if we end in the
4267 * middle of a shift sequence we have to back off the read position
4268 * and the output to the beginning of the sequence, otherwise we lose
4269 * all the shift state (seen bits, number of bits seen, high
4270 * surrogate). */
4271
Alexander Belopolsky40018472011-02-26 01:02:56 +00004272PyObject *
4273PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004274 Py_ssize_t size,
4275 const char *errors,
4276 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004279 Py_ssize_t startinpos;
4280 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004282 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004283 const char *errmsg = "";
4284 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004285 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004286 unsigned int base64bits = 0;
4287 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004288 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 PyObject *errorHandler = NULL;
4290 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004291
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004292 if (size == 0) {
4293 if (consumed)
4294 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004295 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004296 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004299 _PyUnicodeWriter_Init(&writer);
4300 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301
4302 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004303 e = s + size;
4304
4305 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004306 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004308 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004309
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 if (inShift) { /* in a base-64 section */
4311 if (IS_BASE64(ch)) { /* consume a base-64 character */
4312 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4313 base64bits += 6;
4314 s++;
4315 if (base64bits >= 16) {
4316 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004317 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 base64bits -= 16;
4319 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004320 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 if (surrogate) {
4322 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004323 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4324 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004325 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004326 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004328 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329 }
4330 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004331 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004332 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 }
4335 }
Victor Stinner551ac952011-11-29 22:58:13 +01004336 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 /* first surrogate */
4338 surrogate = outCh;
4339 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004341 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004342 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 }
4344 }
4345 }
4346 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347 inShift = 0;
4348 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004350 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004351 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004352 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 if (base64bits > 0) { /* left-over bits */
4355 if (base64bits >= 6) {
4356 /* We've seen at least one base-64 character */
4357 errmsg = "partial character in shift sequence";
4358 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004359 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 else {
4361 /* Some bits remain; they should be zero */
4362 if (base64buffer != 0) {
4363 errmsg = "non-zero padding bits in shift sequence";
4364 goto utf7Error;
4365 }
4366 }
4367 }
4368 if (ch != '-') {
4369 /* '-' is absorbed; other terminating
4370 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004371 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
4375 }
4376 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 s++; /* consume '+' */
4379 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004381 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004382 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 }
4384 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004386 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004388 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
4390 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004393 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 else {
4397 startinpos = s-starts;
4398 s++;
4399 errmsg = "unexpected special character";
4400 goto utf7Error;
4401 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004402 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004405 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 errors, &errorHandler,
4407 "utf7", errmsg,
4408 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 }
4412
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 /* end of string */
4414
4415 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4416 /* if we're in an inconsistent state, that's an error */
4417 if (surrogate ||
4418 (base64bits >= 6) ||
4419 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 errors, &errorHandler,
4423 "utf7", "unterminated shift sequence",
4424 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 goto onError;
4427 if (s < e)
4428 goto restart;
4429 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431
4432 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004433 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004435 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004436 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004437 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004438 writer.kind, writer.data, shiftOutStart);
4439 Py_XDECREF(errorHandler);
4440 Py_XDECREF(exc);
4441 _PyUnicodeWriter_Dealloc(&writer);
4442 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004443 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004444 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 }
4446 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004447 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004449 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 Py_XDECREF(errorHandler);
4452 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004453 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 Py_XDECREF(errorHandler);
4457 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 return NULL;
4460}
4461
4462
Alexander Belopolsky40018472011-02-26 01:02:56 +00004463PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004464_PyUnicode_EncodeUTF7(PyObject *str,
4465 int base64SetO,
4466 int base64WhiteSpace,
4467 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004469 int kind;
4470 void *data;
4471 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004472 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004475 unsigned int base64bits = 0;
4476 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 char * out;
4478 char * start;
4479
Benjamin Petersonbac79492012-01-14 13:34:47 -05004480 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004481 return NULL;
4482 kind = PyUnicode_KIND(str);
4483 data = PyUnicode_DATA(str);
4484 len = PyUnicode_GET_LENGTH(str);
4485
4486 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004490 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004491 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004492 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 if (v == NULL)
4494 return NULL;
4495
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004496 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004497 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004498 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 if (inShift) {
4501 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4502 /* shifting out */
4503 if (base64bits) { /* output remaining bits */
4504 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4505 base64buffer = 0;
4506 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507 }
4508 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 /* Characters not in the BASE64 set implicitly unshift the sequence
4510 so no '-' is required, except if the character is itself a '-' */
4511 if (IS_BASE64(ch) || ch == '-') {
4512 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 *out++ = (char) ch;
4515 }
4516 else {
4517 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004518 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 else { /* not in a shift sequence */
4521 if (ch == '+') {
4522 *out++ = '+';
4523 *out++ = '-';
4524 }
4525 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4526 *out++ = (char) ch;
4527 }
4528 else {
4529 *out++ = '+';
4530 inShift = 1;
4531 goto encode_char;
4532 }
4533 }
4534 continue;
4535encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004537 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 /* code first surrogate */
4540 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004541 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 while (base64bits >= 6) {
4543 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4544 base64bits -= 6;
4545 }
4546 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004547 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 base64bits += 16;
4550 base64buffer = (base64buffer << 16) | ch;
4551 while (base64bits >= 6) {
4552 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4553 base64bits -= 6;
4554 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004555 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556 if (base64bits)
4557 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4558 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004560 if (_PyBytes_Resize(&v, out - start) < 0)
4561 return NULL;
4562 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004564PyObject *
4565PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4566 Py_ssize_t size,
4567 int base64SetO,
4568 int base64WhiteSpace,
4569 const char *errors)
4570{
4571 PyObject *result;
4572 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4573 if (tmp == NULL)
4574 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004575 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004576 base64WhiteSpace, errors);
4577 Py_DECREF(tmp);
4578 return result;
4579}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

Guido van Rossumd57fd912000-03-10 22:53:23 +00004587/* --- UTF-8 Codec -------------------------------------------------------- */
4588
Alexander Belopolsky40018472011-02-26 01:02:56 +00004589PyObject *
4590PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004591 Py_ssize_t size,
4592 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593{
Walter Dörwald69652032004-09-07 20:24:22 +00004594 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4595}
4596
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004597#include "stringlib/asciilib.h"
4598#include "stringlib/codecs.h"
4599#include "stringlib/undef.h"
4600
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004601#include "stringlib/ucs1lib.h"
4602#include "stringlib/codecs.h"
4603#include "stringlib/undef.h"
4604
4605#include "stringlib/ucs2lib.h"
4606#include "stringlib/codecs.h"
4607#include "stringlib/undef.h"
4608
4609#include "stringlib/ucs4lib.h"
4610#include "stringlib/codecs.h"
4611#include "stringlib/undef.h"
4612
Antoine Pitrouab868312009-01-10 15:40:25 +00004613/* Mask to quickly check whether a C 'long' contains a
4614 non-ASCII, UTF8-encoded char. */
4615#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004616# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004617#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004618# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004619#else
4620# error C 'long' size should be either 4 or 8!
4621#endif
4622
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004623static Py_ssize_t
4624ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004625{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004626 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004627 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004629 /*
4630 * Issue #17237: m68k is a bit different from most architectures in
4631 * that objects do not use "natural alignment" - for example, int and
4632 * long are only aligned at 2-byte boundaries. Therefore the assert()
4633 * won't work; also, tests have shown that skipping the "optimised
4634 * version" will even speed up m68k.
4635 */
4636#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004638 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4639 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 /* Fast path, see in STRINGLIB(utf8_decode) for
4641 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004642 /* Help allocation */
4643 const char *_p = p;
4644 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645 while (_p < aligned_end) {
4646 unsigned long value = *(const unsigned long *) _p;
4647 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004649 *((unsigned long *)q) = value;
4650 _p += SIZEOF_LONG;
4651 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004652 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004653 p = _p;
4654 while (p < end) {
4655 if ((unsigned char)*p & 0x80)
4656 break;
4657 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004662#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 while (p < end) {
4664 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4665 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004666 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004667 /* Help allocation */
4668 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 while (_p < aligned_end) {
4670 unsigned long value = *(unsigned long *) _p;
4671 if (value & ASCII_CHAR_MASK)
4672 break;
4673 _p += SIZEOF_LONG;
4674 }
4675 p = _p;
4676 if (_p == end)
4677 break;
4678 }
4679 if ((unsigned char)*p & 0x80)
4680 break;
4681 ++p;
4682 }
4683 memcpy(dest, start, p - start);
4684 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685}
Antoine Pitrouab868312009-01-10 15:40:25 +00004686
Victor Stinner785938e2011-12-11 20:09:03 +01004687PyObject *
4688PyUnicode_DecodeUTF8Stateful(const char *s,
4689 Py_ssize_t size,
4690 const char *errors,
4691 Py_ssize_t *consumed)
4692{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004693 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004694 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696
4697 Py_ssize_t startinpos;
4698 Py_ssize_t endinpos;
4699 const char *errmsg = "";
4700 PyObject *errorHandler = NULL;
4701 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004702
4703 if (size == 0) {
4704 if (consumed)
4705 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004706 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004707 }
4708
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004709 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4710 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004711 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712 *consumed = 1;
4713 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004714 }
4715
Victor Stinner8f674cc2013-04-17 23:02:17 +02004716 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004717 writer.min_length = size;
4718 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004720
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004721 writer.pos = ascii_decode(s, end, writer.data);
4722 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 while (s < end) {
4724 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 if (PyUnicode_IS_ASCII(writer.buffer))
4728 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004730 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004733 } else {
4734 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 }
4737
4738 switch (ch) {
4739 case 0:
4740 if (s == end || consumed)
4741 goto End;
4742 errmsg = "unexpected end of data";
4743 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004744 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 break;
4746 case 1:
4747 errmsg = "invalid start byte";
4748 startinpos = s - starts;
4749 endinpos = startinpos + 1;
4750 break;
4751 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004752 case 3:
4753 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 errmsg = "invalid continuation byte";
4755 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004756 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 break;
4758 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004759 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 goto onError;
4761 continue;
4762 }
4763
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 errors, &errorHandler,
4766 "utf-8", errmsg,
4767 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004768 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004770 }
4771
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 if (consumed)
4774 *consumed = s - starts;
4775
4776 Py_XDECREF(errorHandler);
4777 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004778 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779
4780onError:
4781 Py_XDECREF(errorHandler);
4782 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004783 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004785}
4786
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error.
   NULL is also returned when size + 1 wide characters cannot be
   expressed in a Py_ssize_t byte count. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Written so that "size + 1" is never evaluated when it could
       overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* A value handed back here did not fit in 16 bits, so it is
               a non-BMP code point (not itself a surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */

Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843/* Primary internal function which creates utf8 encoded bytes objects.
4844
4845 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004846 and allocate exactly as much space needed at the end. Else allocate the
4847 maximum possible needed (4 result bytes per Unicode character), and return
4848 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004849*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004850PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004851_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
Victor Stinner6099a032011-12-18 14:22:26 +01004853 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004854 void *data;
4855 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857 if (!PyUnicode_Check(unicode)) {
4858 PyErr_BadArgument();
4859 return NULL;
4860 }
4861
4862 if (PyUnicode_READY(unicode) == -1)
4863 return NULL;
4864
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004865 if (PyUnicode_UTF8(unicode))
4866 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4867 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868
4869 kind = PyUnicode_KIND(unicode);
4870 data = PyUnicode_DATA(unicode);
4871 size = PyUnicode_GET_LENGTH(unicode);
4872
Benjamin Petersonead6b532011-12-20 17:23:42 -06004873 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004874 default:
4875 assert(0);
4876 case PyUnicode_1BYTE_KIND:
4877 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4878 assert(!PyUnicode_IS_ASCII(unicode));
4879 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4880 case PyUnicode_2BYTE_KIND:
4881 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4882 case PyUnicode_4BYTE_KIND:
4883 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885}
4886
Alexander Belopolsky40018472011-02-26 01:02:56 +00004887PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4889 Py_ssize_t size,
4890 const char *errors)
4891{
4892 PyObject *v, *unicode;
4893
4894 unicode = PyUnicode_FromUnicode(s, size);
4895 if (unicode == NULL)
4896 return NULL;
4897 v = _PyUnicode_AsUTF8String(unicode, errors);
4898 Py_DECREF(unicode);
4899 return v;
4900}
4901
4902PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004903PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906}
4907
Walter Dörwald41980ca2007-08-16 21:55:45 +00004908/* --- UTF-32 Codec ------------------------------------------------------- */
4909
4910PyObject *
4911PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 Py_ssize_t size,
4913 const char *errors,
4914 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004915{
4916 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4917}
4918
4919PyObject *
4920PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 Py_ssize_t size,
4922 const char *errors,
4923 int *byteorder,
4924 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925{
4926 const char *starts = s;
4927 Py_ssize_t startinpos;
4928 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004929 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004930 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004931 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004932 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934 PyObject *errorHandler = NULL;
4935 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004936
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 q = (unsigned char *)s;
4938 e = q + size;
4939
4940 if (byteorder)
4941 bo = *byteorder;
4942
4943 /* Check for BOM marks (U+FEFF) in the input and adjust current
4944 byte order setting accordingly. In native mode, the leading BOM
4945 mark is skipped, in all other modes, it is copied to the output
4946 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004947 if (bo == 0 && size >= 4) {
4948 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4949 if (bom == 0x0000FEFF) {
4950 bo = -1;
4951 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004953 else if (bom == 0xFFFE0000) {
4954 bo = 1;
4955 q += 4;
4956 }
4957 if (byteorder)
4958 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 }
4960
Victor Stinnere64322e2012-10-30 23:12:47 +01004961 if (q == e) {
4962 if (consumed)
4963 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004964 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965 }
4966
Victor Stinnere64322e2012-10-30 23:12:47 +01004967#ifdef WORDS_BIGENDIAN
4968 le = bo < 0;
4969#else
4970 le = bo <= 0;
4971#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004972 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004973
Victor Stinner8f674cc2013-04-17 23:02:17 +02004974 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004975 writer.min_length = (e - q + 3) / 4;
4976 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004977 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004978
Victor Stinnere64322e2012-10-30 23:12:47 +01004979 while (1) {
4980 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004982
Victor Stinnere64322e2012-10-30 23:12:47 +01004983 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 enum PyUnicode_Kind kind = writer.kind;
4985 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004988 if (le) {
4989 do {
4990 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4991 if (ch > maxch)
4992 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004993 if (kind != PyUnicode_1BYTE_KIND &&
4994 Py_UNICODE_IS_SURROGATE(ch))
4995 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004997 q += 4;
4998 } while (q <= last);
4999 }
5000 else {
5001 do {
5002 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5003 if (ch > maxch)
5004 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005005 if (kind != PyUnicode_1BYTE_KIND &&
5006 Py_UNICODE_IS_SURROGATE(ch))
5007 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005009 q += 4;
5010 } while (q <= last);
5011 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 }
5014
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005015 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005016 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005017 startinpos = ((const char *)q) - starts;
5018 endinpos = startinpos + 4;
5019 }
5020 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005021 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005023 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005025 startinpos = ((const char *)q) - starts;
5026 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005028 else {
5029 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005030 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005031 goto onError;
5032 q += 4;
5033 continue;
5034 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005035 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005036 startinpos = ((const char *)q) - starts;
5037 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005039
5040 /* The remaining input chars are ignored if the callback
5041 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005042 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005044 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005046 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048 }
5049
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 Py_XDECREF(errorHandler);
5054 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005055 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005058 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 Py_XDECREF(errorHandler);
5060 Py_XDECREF(exc);
5061 return NULL;
5062}
5063
5064PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005065_PyUnicode_EncodeUTF32(PyObject *str,
5066 const char *errors,
5067 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005069 int kind;
5070 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005071 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005072 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005073 unsigned char *p;
5074 Py_ssize_t nsize, i;
5075 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005076#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005077 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005079 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005081 const char *encoding;
5082 PyObject *errorHandler = NULL;
5083 PyObject *exc = NULL;
5084 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
Serhiy Storchaka30793282014-01-04 22:44:01 +02005086#define STORECHAR(CH) \
5087 do { \
5088 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5089 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5090 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5091 p[iorder[0]] = (CH) & 0xff; \
5092 p += 4; \
5093 } while(0)
5094
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005095 if (!PyUnicode_Check(str)) {
5096 PyErr_BadArgument();
5097 return NULL;
5098 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005099 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100 return NULL;
5101 kind = PyUnicode_KIND(str);
5102 data = PyUnicode_DATA(str);
5103 len = PyUnicode_GET_LENGTH(str);
5104
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005105 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005106 if (nsize > PY_SSIZE_T_MAX / 4)
5107 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005108 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109 if (v == NULL)
5110 return NULL;
5111
Serhiy Storchaka30793282014-01-04 22:44:01 +02005112 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005114 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005115 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005116 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117
Serhiy Storchaka30793282014-01-04 22:44:01 +02005118 if (byteorder == -1) {
5119 /* force LE */
5120 iorder[0] = 0;
5121 iorder[1] = 1;
5122 iorder[2] = 2;
5123 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005124 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005125 }
5126 else if (byteorder == 1) {
5127 /* force BE */
5128 iorder[0] = 3;
5129 iorder[1] = 2;
5130 iorder[2] = 1;
5131 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005132 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005133 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005134 else
5135 encoding = "utf-32";
5136
5137 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005138 for (i = 0; i < len; i++)
5139 STORECHAR(PyUnicode_READ(kind, data, i));
5140 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141 }
5142
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005144 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005145 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5146 i++;
5147 assert(ch <= MAX_UNICODE);
5148 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5149 STORECHAR(ch);
5150 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005151 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005152
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005153 rep = unicode_encode_call_errorhandler(
5154 errors, &errorHandler,
5155 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005156 str, &exc, i-1, i, &i);
5157
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005158 if (!rep)
5159 goto error;
5160
5161 if (PyBytes_Check(rep)) {
5162 repsize = PyBytes_GET_SIZE(rep);
5163 if (repsize & 3) {
5164 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005165 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005166 "surrogates not allowed");
5167 goto error;
5168 }
5169 moreunits = repsize / 4;
5170 }
5171 else {
5172 assert(PyUnicode_Check(rep));
5173 if (PyUnicode_READY(rep) < 0)
5174 goto error;
5175 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5176 if (!PyUnicode_IS_ASCII(rep)) {
5177 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005178 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005179 "surrogates not allowed");
5180 goto error;
5181 }
5182 }
5183
5184 /* four bytes are reserved for each surrogate */
5185 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005186 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005187 Py_ssize_t morebytes = 4 * (moreunits - 1);
5188 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5189 /* integer overflow */
5190 PyErr_NoMemory();
5191 goto error;
5192 }
5193 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5194 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005195 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005196 }
5197
5198 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005199 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5200 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005201 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005202 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005203 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005204 repdata = PyUnicode_1BYTE_DATA(rep);
5205 while (repsize--) {
5206 Py_UCS4 ch = *repdata++;
5207 STORECHAR(ch);
5208 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005209 }
5210
5211 Py_CLEAR(rep);
5212 }
5213
5214 /* Cut back to size actually needed. This is necessary for, for example,
5215 encoding of a string containing isolated surrogates and the 'ignore'
5216 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005217 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005218 if (nsize != PyBytes_GET_SIZE(v))
5219 _PyBytes_Resize(&v, nsize);
5220 Py_XDECREF(errorHandler);
5221 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005222 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005223 error:
5224 Py_XDECREF(rep);
5225 Py_XDECREF(errorHandler);
5226 Py_XDECREF(exc);
5227 Py_XDECREF(v);
5228 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230}
5231
Alexander Belopolsky40018472011-02-26 01:02:56 +00005232PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005233PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5234 Py_ssize_t size,
5235 const char *errors,
5236 int byteorder)
5237{
5238 PyObject *result;
5239 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5240 if (tmp == NULL)
5241 return NULL;
5242 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5243 Py_DECREF(tmp);
5244 return result;
5245}
5246
5247PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005248PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005249{
Victor Stinnerb960b342011-11-20 19:12:52 +01005250 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251}
5252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253/* --- UTF-16 Codec ------------------------------------------------------- */
5254
Tim Peters772747b2001-08-09 22:21:55 +00005255PyObject *
5256PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 Py_ssize_t size,
5258 const char *errors,
5259 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260{
Walter Dörwald69652032004-09-07 20:24:22 +00005261 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5262}
5263
5264PyObject *
5265PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 Py_ssize_t size,
5267 const char *errors,
5268 int *byteorder,
5269 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005272 Py_ssize_t startinpos;
5273 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005274 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005275 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005276 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005277 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005278 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 PyObject *errorHandler = NULL;
5280 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005281 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282
Tim Peters772747b2001-08-09 22:21:55 +00005283 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005284 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285
5286 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005287 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005289 /* Check for BOM marks (U+FEFF) in the input and adjust current
5290 byte order setting accordingly. In native mode, the leading BOM
5291 mark is skipped, in all other modes, it is copied to the output
5292 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005293 if (bo == 0 && size >= 2) {
5294 const Py_UCS4 bom = (q[1] << 8) | q[0];
5295 if (bom == 0xFEFF) {
5296 q += 2;
5297 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 else if (bom == 0xFFFE) {
5300 q += 2;
5301 bo = 1;
5302 }
5303 if (byteorder)
5304 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306
Antoine Pitrou63065d72012-05-15 23:48:04 +02005307 if (q == e) {
5308 if (consumed)
5309 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005310 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005311 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005312
Christian Heimes743e0cd2012-10-17 23:52:17 +02005313#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005314 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005316#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005317 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005318 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005319#endif
Tim Peters772747b2001-08-09 22:21:55 +00005320
Antoine Pitrou63065d72012-05-15 23:48:04 +02005321 /* Note: size will always be longer than the resulting Unicode
5322 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005323 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005324 writer.min_length = (e - q + 1) / 2;
5325 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005326 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005327
Antoine Pitrou63065d72012-05-15 23:48:04 +02005328 while (1) {
5329 Py_UCS4 ch = 0;
5330 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005331 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005333 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005334 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005335 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 native_ordering);
5337 else
5338 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005339 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005340 native_ordering);
5341 } else if (kind == PyUnicode_2BYTE_KIND) {
5342 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005343 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005344 native_ordering);
5345 } else {
5346 assert(kind == PyUnicode_4BYTE_KIND);
5347 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005348 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005349 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005350 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005351 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 switch (ch)
5354 {
5355 case 0:
5356 /* remaining byte at the end? (size should be even) */
5357 if (q == e || consumed)
5358 goto End;
5359 errmsg = "truncated data";
5360 startinpos = ((const char *)q) - starts;
5361 endinpos = ((const char *)e) - starts;
5362 break;
5363 /* The remaining input chars are ignored if the callback
5364 chooses to skip the input */
5365 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005366 q -= 2;
5367 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005368 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005369 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005370 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005371 endinpos = ((const char *)e) - starts;
5372 break;
5373 case 2:
5374 errmsg = "illegal encoding";
5375 startinpos = ((const char *)q) - 2 - starts;
5376 endinpos = startinpos + 2;
5377 break;
5378 case 3:
5379 errmsg = "illegal UTF-16 surrogate";
5380 startinpos = ((const char *)q) - 4 - starts;
5381 endinpos = startinpos + 2;
5382 break;
5383 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005384 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005385 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 continue;
5387 }
5388
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005389 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005390 errors,
5391 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005392 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005393 &starts,
5394 (const char **)&e,
5395 &startinpos,
5396 &endinpos,
5397 &exc,
5398 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005399 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 }
5402
Antoine Pitrou63065d72012-05-15 23:48:04 +02005403End:
Walter Dörwald69652032004-09-07 20:24:22 +00005404 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407 Py_XDECREF(errorHandler);
5408 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005409 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005412 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413 Py_XDECREF(errorHandler);
5414 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 return NULL;
5416}
5417
Tim Peters772747b2001-08-09 22:21:55 +00005418PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419_PyUnicode_EncodeUTF16(PyObject *str,
5420 const char *errors,
5421 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005423 enum PyUnicode_Kind kind;
5424 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005426 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005427 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005428 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005429#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005430 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005431#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005433#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 const char *encoding;
5435 Py_ssize_t nsize, pos;
5436 PyObject *errorHandler = NULL;
5437 PyObject *exc = NULL;
5438 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005439
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005440 if (!PyUnicode_Check(str)) {
5441 PyErr_BadArgument();
5442 return NULL;
5443 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005444 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445 return NULL;
5446 kind = PyUnicode_KIND(str);
5447 data = PyUnicode_DATA(str);
5448 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005449
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005451 if (kind == PyUnicode_4BYTE_KIND) {
5452 const Py_UCS4 *in = (const Py_UCS4 *)data;
5453 const Py_UCS4 *end = in + len;
5454 while (in < end)
5455 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005456 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 }
5458 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 nsize = len + pairs + (byteorder == 0);
5461 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 if (v == NULL)
5463 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005465 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005466 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005467 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005469 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005471 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005472
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 if (kind == PyUnicode_1BYTE_KIND) {
5474 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5475 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005476 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005477
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 if (byteorder < 0)
5479 encoding = "utf-16-le";
5480 else if (byteorder > 0)
5481 encoding = "utf-16-be";
5482 else
5483 encoding = "utf-16";
5484
5485 pos = 0;
5486 while (pos < len) {
5487 Py_ssize_t repsize, moreunits;
5488
5489 if (kind == PyUnicode_2BYTE_KIND) {
5490 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5491 &out, native_ordering);
5492 }
5493 else {
5494 assert(kind == PyUnicode_4BYTE_KIND);
5495 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5496 &out, native_ordering);
5497 }
5498 if (pos == len)
5499 break;
5500
5501 rep = unicode_encode_call_errorhandler(
5502 errors, &errorHandler,
5503 encoding, "surrogates not allowed",
5504 str, &exc, pos, pos + 1, &pos);
5505 if (!rep)
5506 goto error;
5507
5508 if (PyBytes_Check(rep)) {
5509 repsize = PyBytes_GET_SIZE(rep);
5510 if (repsize & 1) {
5511 raise_encode_exception(&exc, encoding,
5512 str, pos - 1, pos,
5513 "surrogates not allowed");
5514 goto error;
5515 }
5516 moreunits = repsize / 2;
5517 }
5518 else {
5519 assert(PyUnicode_Check(rep));
5520 if (PyUnicode_READY(rep) < 0)
5521 goto error;
5522 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5523 if (!PyUnicode_IS_ASCII(rep)) {
5524 raise_encode_exception(&exc, encoding,
5525 str, pos - 1, pos,
5526 "surrogates not allowed");
5527 goto error;
5528 }
5529 }
5530
5531 /* two bytes are reserved for each surrogate */
5532 if (moreunits > 1) {
5533 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5534 Py_ssize_t morebytes = 2 * (moreunits - 1);
5535 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5536 /* integer overflow */
5537 PyErr_NoMemory();
5538 goto error;
5539 }
5540 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5541 goto error;
5542 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5543 }
5544
5545 if (PyBytes_Check(rep)) {
5546 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5547 out += moreunits;
5548 } else /* rep is unicode */ {
5549 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5550 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5551 &out, native_ordering);
5552 }
5553
5554 Py_CLEAR(rep);
5555 }
5556
5557 /* Cut back to size actually needed. This is necessary for, for example,
5558 encoding of a string containing isolated surrogates and the 'ignore' handler
5559 is used. */
5560 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5561 if (nsize != PyBytes_GET_SIZE(v))
5562 _PyBytes_Resize(&v, nsize);
5563 Py_XDECREF(errorHandler);
5564 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005565 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005566 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005567 error:
5568 Py_XDECREF(rep);
5569 Py_XDECREF(errorHandler);
5570 Py_XDECREF(exc);
5571 Py_XDECREF(v);
5572 return NULL;
5573#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574}
5575
Alexander Belopolsky40018472011-02-26 01:02:56 +00005576PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005577PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5578 Py_ssize_t size,
5579 const char *errors,
5580 int byteorder)
5581{
5582 PyObject *result;
5583 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5584 if (tmp == NULL)
5585 return NULL;
5586 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5587 Py_DECREF(tmp);
5588 return result;
5589}
5590
5591PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005592PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005594 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595}
5596
5597/* --- Unicode Escape Codec ----------------------------------------------- */
5598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005599/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5600 if all the escapes in the string make it still a valid ASCII string.
5601 Returns -1 if any escapes were found which cause the string to
5602 pop out of ASCII range. Otherwise returns the length of the
5603 required buffer to hold the string.
5604 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005605static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005606length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5607{
5608 const unsigned char *p = (const unsigned char *)s;
5609 const unsigned char *end = p + size;
5610 Py_ssize_t length = 0;
5611
5612 if (size < 0)
5613 return -1;
5614
5615 for (; p < end; ++p) {
5616 if (*p > 127) {
5617 /* Non-ASCII */
5618 return -1;
5619 }
5620 else if (*p != '\\') {
5621 /* Normal character */
5622 ++length;
5623 }
5624 else {
5625 /* Backslash-escape, check next char */
5626 ++p;
5627 /* Escape sequence reaches till end of string or
5628 non-ASCII follow-up. */
5629 if (p >= end || *p > 127)
5630 return -1;
5631 switch (*p) {
5632 case '\n':
5633 /* backslash + \n result in zero characters */
5634 break;
5635 case '\\': case '\'': case '\"':
5636 case 'b': case 'f': case 't':
5637 case 'n': case 'r': case 'v': case 'a':
5638 ++length;
5639 break;
5640 case '0': case '1': case '2': case '3':
5641 case '4': case '5': case '6': case '7':
5642 case 'x': case 'u': case 'U': case 'N':
5643 /* these do not guarantee ASCII characters */
5644 return -1;
5645 default:
5646 /* count the backslash + the other character */
5647 length += 2;
5648 }
5649 }
5650 }
5651 return length;
5652}
5653
Fredrik Lundh06d12682001-01-24 07:59:11 +00005654static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005655
Alexander Belopolsky40018472011-02-26 01:02:56 +00005656PyObject *
5657PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005658 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005659 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005662 Py_ssize_t startinpos;
5663 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005664 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005666 char* message;
5667 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 PyObject *errorHandler = NULL;
5669 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005670 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005671
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005672 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005673 if (len == 0)
5674 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675
5676 /* After length_of_escaped_ascii_string() there are two alternatives,
5677 either the string is pure ASCII with named escapes like \n, etc.
5678 and we determined it's exact size (common case)
5679 or it contains \x, \u, ... escape sequences. then we create a
5680 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005681 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005682 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005683 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684 }
5685 else {
5686 /* Escaped strings will always be longer than the resulting
5687 Unicode string, so we start with size here and then reduce the
5688 length after conversion to the true value.
5689 (but if the error callback returns a long replacement string
5690 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005691 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005692 }
5693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005695 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005697
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 while (s < end) {
5699 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005700 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
5703 /* Non-escape characters are interpreted as Unicode ordinals */
5704 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005705 x = (unsigned char)*s;
5706 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005707 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005708 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 continue;
5710 }
5711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 /* \ - Escapes */
5714 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005715 c = *s++;
5716 if (s > end)
5717 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005719 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005722#define WRITECHAR(ch) \
5723 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005724 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005725 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005726 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005729 case '\\': WRITECHAR('\\'); break;
5730 case '\'': WRITECHAR('\''); break;
5731 case '\"': WRITECHAR('\"'); break;
5732 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005733 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005734 case 'f': WRITECHAR('\014'); break;
5735 case 't': WRITECHAR('\t'); break;
5736 case 'n': WRITECHAR('\n'); break;
5737 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005738 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005739 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005741 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 case '0': case '1': case '2': case '3':
5745 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005746 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005747 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005748 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005749 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005750 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 break;
5754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* hex escapes */
5756 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758 digits = 2;
5759 message = "truncated \\xXX escape";
5760 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 digits = 4;
5765 message = "truncated \\uXXXX escape";
5766 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005770 digits = 8;
5771 message = "truncated \\UXXXXXXXX escape";
5772 hexescape:
5773 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005774 if (end - s < digits) {
5775 /* count only hex digits */
5776 for (; s < end; ++s) {
5777 c = (unsigned char)*s;
5778 if (!Py_ISXDIGIT(c))
5779 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005780 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005781 goto error;
5782 }
5783 for (; digits--; ++s) {
5784 c = (unsigned char)*s;
5785 if (!Py_ISXDIGIT(c))
5786 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005787 chr = (chr<<4) & ~0xF;
5788 if (c >= '0' && c <= '9')
5789 chr += c - '0';
5790 else if (c >= 'a' && c <= 'f')
5791 chr += 10 + c - 'a';
5792 else
5793 chr += 10 + c - 'A';
5794 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005795 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 /* _decoding_error will have already written into the
5797 target buffer. */
5798 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005799 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005800 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005801 message = "illegal Unicode character";
5802 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005803 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005804 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005805 break;
5806
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005808 case 'N':
5809 message = "malformed \\N character escape";
5810 if (ucnhash_CAPI == NULL) {
5811 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005812 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5813 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005814 if (ucnhash_CAPI == NULL)
5815 goto ucnhashError;
5816 }
5817 if (*s == '{') {
5818 const char *start = s+1;
5819 /* look for the closing brace */
5820 while (*s != '}' && s < end)
5821 s++;
5822 if (s > start && s < end && *s == '}') {
5823 /* found a name. look it up in the unicode database */
5824 message = "unknown Unicode character name";
5825 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005826 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005827 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005828 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 goto store;
5830 }
5831 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005832 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833
5834 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005835 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 message = "\\ at end of string";
5837 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005838 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005839 }
5840 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005842 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005843 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005844 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005846 continue;
5847
5848 error:
5849 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005850 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005851 errors, &errorHandler,
5852 "unicodeescape", message,
5853 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005854 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005855 goto onError;
5856 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005858#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005859
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005863
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005865 PyErr_SetString(
5866 PyExc_UnicodeError,
5867 "\\N escapes not supported (can't load unicodedata module)"
5868 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005869 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 Py_XDECREF(errorHandler);
5871 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005872 return NULL;
5873
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005875 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 return NULL;
5879}
5880
5881/* Return a Unicode-Escape string version of the Unicode object.
5882
5883 If quotes is true, the string is enclosed in u"" or u'' quotes as
5884 appropriate.
5885
5886*/
5887
Alexander Belopolsky40018472011-02-26 01:02:56 +00005888PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005892 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 int kind;
5895 void *data;
5896 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
Ezio Melottie7f90372012-10-05 03:33:31 +03005898 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005899 escape.
5900
Ezio Melottie7f90372012-10-05 03:33:31 +03005901 For UCS1 strings it's '\xxx', 4 bytes per source character.
5902 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5903 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005904 */
5905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 if (!PyUnicode_Check(unicode)) {
5907 PyErr_BadArgument();
5908 return NULL;
5909 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005910 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 return NULL;
5912 len = PyUnicode_GET_LENGTH(unicode);
5913 kind = PyUnicode_KIND(unicode);
5914 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005915 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5917 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5918 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5919 }
5920
5921 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005922 return PyBytes_FromStringAndSize(NULL, 0);
5923
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005924 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005926
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005927 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005929 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 if (repr == NULL)
5932 return NULL;
5933
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005934 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005937 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005938
Walter Dörwald79e913e2007-05-12 11:08:06 +00005939 /* Escape backslashes */
5940 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 *p++ = '\\';
5942 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005943 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005944 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005945
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005946 /* Map 21-bit characters to '\U00xxxxxx' */
5947 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005948 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005949 *p++ = '\\';
5950 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005951 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5952 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5953 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5954 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5955 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5956 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5957 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5958 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005960 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005963 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 *p++ = '\\';
5965 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005966 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5967 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5968 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5969 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005971
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005972 /* Map special whitespace to '\t', \n', '\r' */
5973 else if (ch == '\t') {
5974 *p++ = '\\';
5975 *p++ = 't';
5976 }
5977 else if (ch == '\n') {
5978 *p++ = '\\';
5979 *p++ = 'n';
5980 }
5981 else if (ch == '\r') {
5982 *p++ = '\\';
5983 *p++ = 'r';
5984 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005985
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005986 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005987 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005989 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005990 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5991 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005992 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 /* Copy everything else as-is */
5995 else
5996 *p++ = (char) ch;
5997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005999 assert(p - PyBytes_AS_STRING(repr) > 0);
6000 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6001 return NULL;
6002 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
Alexander Belopolsky40018472011-02-26 01:02:56 +00006005PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006006PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6007 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 PyObject *result;
6010 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6011 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 result = PyUnicode_AsUnicodeEscapeString(tmp);
6014 Py_DECREF(tmp);
6015 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
6018/* --- Raw Unicode Escape Codec ------------------------------------------- */
6019
Alexander Belopolsky40018472011-02-26 01:02:56 +00006020PyObject *
6021PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006022 Py_ssize_t size,
6023 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 Py_ssize_t startinpos;
6027 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006028 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 const char *end;
6030 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 PyObject *errorHandler = NULL;
6032 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006033
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006034 if (size == 0)
6035 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 /* Escaped strings will always be longer than the resulting
6038 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 length after conversion to the true value. (But decoding error
6040 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006041 _PyUnicodeWriter_Init(&writer);
6042 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006043
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 end = s + size;
6045 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 unsigned char c;
6047 Py_UCS4 x;
6048 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006049 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 /* Non-escape characters are interpreted as Unicode ordinals */
6052 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006054 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006055 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 startinpos = s-starts;
6059
6060 /* \u-escapes are only interpreted iff the number of leading
6061 backslashes if odd */
6062 bs = s;
6063 for (;s < end;) {
6064 if (*s != '\\')
6065 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006066 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006067 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006068 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 }
6070 if (((s - bs) & 1) == 0 ||
6071 s >= end ||
6072 (*s != 'u' && *s != 'U')) {
6073 continue;
6074 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006075 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 count = *s=='u' ? 4 : 8;
6077 s++;
6078
6079 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 for (x = 0, i = 0; i < count; ++i, ++s) {
6081 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006082 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006084 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 errors, &errorHandler,
6086 "rawunicodeescape", "truncated \\uXXXX",
6087 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 goto onError;
6090 goto nextByte;
6091 }
6092 x = (x<<4) & ~0xF;
6093 if (c >= '0' && c <= '9')
6094 x += c - '0';
6095 else if (c >= 'a' && c <= 'f')
6096 x += 10 + c - 'a';
6097 else
6098 x += 10 + c - 'A';
6099 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006100 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006101 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006103 }
6104 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006105 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006106 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006107 errors, &errorHandler,
6108 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006110 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 nextByte:
6114 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 Py_XDECREF(errorHandler);
6117 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006119
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006121 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 Py_XDECREF(errorHandler);
6123 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return NULL;
6125}
6126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127
Alexander Belopolsky40018472011-02-26 01:02:56 +00006128PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006131 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 char *p;
6133 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006134 Py_ssize_t expandsize, pos;
6135 int kind;
6136 void *data;
6137 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139 if (!PyUnicode_Check(unicode)) {
6140 PyErr_BadArgument();
6141 return NULL;
6142 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006143 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 return NULL;
6145 kind = PyUnicode_KIND(unicode);
6146 data = PyUnicode_DATA(unicode);
6147 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006148 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6149 bytes, and 1 byte characters 4. */
6150 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006154
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 if (repr == NULL)
6157 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006159 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006161 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 for (pos = 0; pos < len; pos++) {
6163 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* Map 32-bit characters to '\Uxxxxxxxx' */
6165 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006166 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006167 *p++ = '\\';
6168 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006169 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6170 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6171 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6172 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6173 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6174 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6175 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6176 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006177 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 *p++ = '\\';
6181 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006182 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6183 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6184 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6185 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 /* Copy everything else as-is */
6188 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 *p++ = (char) ch;
6190 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006191
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192 assert(p > q);
6193 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006194 return NULL;
6195 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196}
6197
Alexander Belopolsky40018472011-02-26 01:02:56 +00006198PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006199PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6200 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202 PyObject *result;
6203 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6204 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006205 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6207 Py_DECREF(tmp);
6208 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209}
6210
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006211/* --- Unicode Internal Codec ------------------------------------------- */
6212
Alexander Belopolsky40018472011-02-26 01:02:56 +00006213PyObject *
6214_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006215 Py_ssize_t size,
6216 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006217{
6218 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006219 Py_ssize_t startinpos;
6220 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006221 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006222 const char *end;
6223 const char *reason;
6224 PyObject *errorHandler = NULL;
6225 PyObject *exc = NULL;
6226
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006227 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006228 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006229 1))
6230 return NULL;
6231
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006232 if (size == 0)
6233 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006234
Victor Stinner8f674cc2013-04-17 23:02:17 +02006235 _PyUnicodeWriter_Init(&writer);
6236 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6237 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006239 }
6240 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006241
Victor Stinner8f674cc2013-04-17 23:02:17 +02006242 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006243 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006244 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006245 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006246 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006247 endinpos = end-starts;
6248 reason = "truncated input";
6249 goto error;
6250 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006251 /* We copy the raw representation one byte at a time because the
6252 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006253 ((char *) &uch)[0] = s[0];
6254 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006255#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006256 ((char *) &uch)[2] = s[2];
6257 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006258#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006259 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006260#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006261 /* We have to sanity check the raw data, otherwise doom looms for
6262 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006263 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006264 endinpos = s - starts + Py_UNICODE_SIZE;
6265 reason = "illegal code point (> 0x10FFFF)";
6266 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006267 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006268#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006269 s += Py_UNICODE_SIZE;
6270#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006271 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006272 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006273 Py_UNICODE uch2;
6274 ((char *) &uch2)[0] = s[0];
6275 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006276 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006277 {
Victor Stinner551ac952011-11-29 22:58:13 +01006278 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006279 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006280 }
6281 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006282#endif
6283
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006284 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006285 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006286 continue;
6287
6288 error:
6289 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006290 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006291 errors, &errorHandler,
6292 "unicode_internal", reason,
6293 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006294 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006295 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006296 }
6297
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006298 Py_XDECREF(errorHandler);
6299 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006300 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006301
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006303 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006304 Py_XDECREF(errorHandler);
6305 Py_XDECREF(exc);
6306 return NULL;
6307}
6308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309/* --- Latin-1 Codec ------------------------------------------------------ */
6310
Alexander Belopolsky40018472011-02-26 01:02:56 +00006311PyObject *
6312PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006313 Py_ssize_t size,
6314 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006317 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318}
6319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006320/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006321static void
6322make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006323 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006324 PyObject *unicode,
6325 Py_ssize_t startpos, Py_ssize_t endpos,
6326 const char *reason)
6327{
6328 if (*exceptionObject == NULL) {
6329 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006331 encoding, unicode, startpos, endpos, reason);
6332 }
6333 else {
6334 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6335 goto onError;
6336 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6337 goto onError;
6338 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6339 goto onError;
6340 return;
6341 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006342 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006343 }
6344}
6345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006347static void
6348raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006349 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006350 PyObject *unicode,
6351 Py_ssize_t startpos, Py_ssize_t endpos,
6352 const char *reason)
6353{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006354 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006355 encoding, unicode, startpos, endpos, reason);
6356 if (*exceptionObject != NULL)
6357 PyCodec_StrictErrors(*exceptionObject);
6358}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359
6360/* error handling callback helper:
6361 build arguments, call the callback and check the arguments,
6362 put the result into newpos and return the replacement string, which
6363 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006364static PyObject *
6365unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006366 PyObject **errorHandler,
6367 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006369 Py_ssize_t startpos, Py_ssize_t endpos,
6370 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006372 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374 PyObject *restuple;
6375 PyObject *resunicode;
6376
6377 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 }
6382
Benjamin Petersonbac79492012-01-14 13:34:47 -05006383 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 return NULL;
6385 len = PyUnicode_GET_LENGTH(unicode);
6386
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006387 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391
6392 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 Py_DECREF(restuple);
6399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006401 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 &resunicode, newpos)) {
6403 Py_DECREF(restuple);
6404 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006406 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6407 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6408 Py_DECREF(restuple);
6409 return NULL;
6410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 *newpos = len + *newpos;
6413 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006414 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 Py_DECREF(restuple);
6416 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 Py_INCREF(resunicode);
6419 Py_DECREF(restuple);
6420 return resunicode;
6421}
6422
Alexander Belopolsky40018472011-02-26 01:02:56 +00006423static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006424unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006425 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006426 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006428 /* input state */
6429 Py_ssize_t pos=0, size;
6430 int kind;
6431 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 /* output object */
6433 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 /* pointer into the output */
6435 char *str;
6436 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006437 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006438 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6439 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 PyObject *errorHandler = NULL;
6441 PyObject *exc = NULL;
6442 /* the following variable is used for caching string comparisons
6443 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6444 int known_errorHandler = -1;
6445
Benjamin Petersonbac79492012-01-14 13:34:47 -05006446 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 return NULL;
6448 size = PyUnicode_GET_LENGTH(unicode);
6449 kind = PyUnicode_KIND(unicode);
6450 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 /* allocate enough for a simple encoding without
6452 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006453 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006454 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006455 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006457 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006458 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 ressize = size;
6460
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 while (pos < size) {
6462 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* can we encode this? */
6465 if (c<limit) {
6466 /* no overflow check, because we know that the space is enough */
6467 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006468 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006469 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 Py_ssize_t requiredsize;
6472 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 Py_ssize_t collstart = pos;
6476 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006478 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 ++collend;
6480 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6481 if (known_errorHandler==-1) {
6482 if ((errors==NULL) || (!strcmp(errors, "strict")))
6483 known_errorHandler = 1;
6484 else if (!strcmp(errors, "replace"))
6485 known_errorHandler = 2;
6486 else if (!strcmp(errors, "ignore"))
6487 known_errorHandler = 3;
6488 else if (!strcmp(errors, "xmlcharrefreplace"))
6489 known_errorHandler = 4;
6490 else
6491 known_errorHandler = 0;
6492 }
6493 switch (known_errorHandler) {
6494 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006495 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 goto onError;
6497 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006498 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 *str++ = '?'; /* fall through */
6500 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 break;
6503 case 4: /* xmlcharrefreplace */
6504 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006505 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006507 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006511 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006513 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006515 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006517 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006519 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006521 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006522 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006523 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006524 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006525 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006526 if (requiredsize > PY_SSIZE_T_MAX - incr)
6527 goto overflow;
6528 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006530 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6531 goto overflow;
6532 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006534 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 requiredsize = 2*ressize;
6536 if (_PyBytes_Resize(&res, requiredsize))
6537 goto onError;
6538 str = PyBytes_AS_STRING(res) + respos;
6539 ressize = requiredsize;
6540 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 /* generate replacement */
6542 for (i = collstart; i < collend; ++i) {
6543 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 break;
6547 default:
6548 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 encoding, reason, unicode, &exc,
6550 collstart, collend, &newpos);
6551 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006552 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006554 if (PyBytes_Check(repunicode)) {
6555 /* Directly copy bytes result to output. */
6556 repsize = PyBytes_Size(repunicode);
6557 if (repsize > 1) {
6558 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006559 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006560 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6561 Py_DECREF(repunicode);
6562 goto overflow;
6563 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006564 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6565 Py_DECREF(repunicode);
6566 goto onError;
6567 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006568 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006569 ressize += repsize-1;
6570 }
6571 memcpy(str, PyBytes_AsString(repunicode), repsize);
6572 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006574 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006575 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 /* need more space? (at least enough for what we
6578 have+the replacement+the rest of the string, so
6579 we won't have to check space for encodable characters) */
6580 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006581 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006582 requiredsize = respos;
6583 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6584 goto overflow;
6585 requiredsize += repsize;
6586 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6587 goto overflow;
6588 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006590 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 requiredsize = 2*ressize;
6592 if (_PyBytes_Resize(&res, requiredsize)) {
6593 Py_DECREF(repunicode);
6594 goto onError;
6595 }
6596 str = PyBytes_AS_STRING(res) + respos;
6597 ressize = requiredsize;
6598 }
6599 /* check if there is anything unencodable in the replacement
6600 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 for (i = 0; repsize-->0; ++i, ++str) {
6602 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006604 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 Py_DECREF(repunicode);
6607 goto onError;
6608 }
6609 *str = (char)c;
6610 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006611 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006612 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006613 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 }
6615 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006616 /* Resize if we allocated to much */
6617 size = str - PyBytes_AS_STRING(res);
6618 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006619 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006620 if (_PyBytes_Resize(&res, size) < 0)
6621 goto onError;
6622 }
6623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624 Py_XDECREF(errorHandler);
6625 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006626 return res;
6627
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006628 overflow:
6629 PyErr_SetString(PyExc_OverflowError,
6630 "encoded result is too long for a Python string");
6631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006632 onError:
6633 Py_XDECREF(res);
6634 Py_XDECREF(errorHandler);
6635 Py_XDECREF(exc);
6636 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637}
6638
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006639/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640PyObject *
6641PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 Py_ssize_t size,
6643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 PyObject *result;
6646 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6647 if (unicode == NULL)
6648 return NULL;
6649 result = unicode_encode_ucs1(unicode, errors, 256);
6650 Py_DECREF(unicode);
6651 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652}
6653
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006655_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
6657 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 PyErr_BadArgument();
6659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006661 if (PyUnicode_READY(unicode) == -1)
6662 return NULL;
6663 /* Fast path: if it is a one-byte string, construct
6664 bytes object directly. */
6665 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6666 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6667 PyUnicode_GET_LENGTH(unicode));
6668 /* Non-Latin-1 characters present. Defer to above function to
6669 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671}
6672
6673PyObject*
6674PyUnicode_AsLatin1String(PyObject *unicode)
6675{
6676 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677}
6678
6679/* --- 7-bit ASCII Codec -------------------------------------------------- */
6680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681PyObject *
6682PyUnicode_DecodeASCII(const char *s,
6683 Py_ssize_t size,
6684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006687 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006688 int kind;
6689 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006690 Py_ssize_t startinpos;
6691 Py_ssize_t endinpos;
6692 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 const char *e;
6694 PyObject *errorHandler = NULL;
6695 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006696
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006698 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006699
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006701 if (size == 1 && (unsigned char)s[0] < 128)
6702 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006703
Victor Stinner8f674cc2013-04-17 23:02:17 +02006704 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006705 writer.min_length = size;
6706 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006707 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006710 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006711 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006712 writer.pos = outpos;
6713 if (writer.pos == size)
6714 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 s += writer.pos;
6717 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006719 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006721 PyUnicode_WRITE(kind, data, writer.pos, c);
6722 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 ++s;
6724 }
6725 else {
6726 startinpos = s-starts;
6727 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006728 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 errors, &errorHandler,
6730 "ascii", "ordinal not in range(128)",
6731 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006734 kind = writer.kind;
6735 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 Py_XDECREF(errorHandler);
6739 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006740 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006741
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006743 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 Py_XDECREF(errorHandler);
6745 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 return NULL;
6747}
6748
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006749/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006750PyObject *
6751PyUnicode_EncodeASCII(const Py_UNICODE *p,
6752 Py_ssize_t size,
6753 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 PyObject *result;
6756 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6757 if (unicode == NULL)
6758 return NULL;
6759 result = unicode_encode_ucs1(unicode, errors, 128);
6760 Py_DECREF(unicode);
6761 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762}
6763
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006765_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766{
6767 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 PyErr_BadArgument();
6769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006771 if (PyUnicode_READY(unicode) == -1)
6772 return NULL;
6773 /* Fast path: if it is an ASCII-only string, construct bytes object
6774 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006775 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006776 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6777 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779}
6780
6781PyObject *
6782PyUnicode_AsASCIIString(PyObject *unicode)
6783{
6784 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785}
6786
Victor Stinner99b95382011-07-04 14:23:54 +02006787#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006788
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006789/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006790
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006791#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006792#define NEED_RETRY
6793#endif
6794
Victor Stinner3a50e702011-10-18 21:21:00 +02006795#ifndef WC_ERR_INVALID_CHARS
6796# define WC_ERR_INVALID_CHARS 0x0080
6797#endif
6798
6799static char*
6800code_page_name(UINT code_page, PyObject **obj)
6801{
6802 *obj = NULL;
6803 if (code_page == CP_ACP)
6804 return "mbcs";
6805 if (code_page == CP_UTF7)
6806 return "CP_UTF7";
6807 if (code_page == CP_UTF8)
6808 return "CP_UTF8";
6809
6810 *obj = PyBytes_FromFormat("cp%u", code_page);
6811 if (*obj == NULL)
6812 return NULL;
6813 return PyBytes_AS_STRING(*obj);
6814}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815
Victor Stinner3a50e702011-10-18 21:21:00 +02006816static DWORD
6817decode_code_page_flags(UINT code_page)
6818{
6819 if (code_page == CP_UTF7) {
6820 /* The CP_UTF7 decoder only supports flags=0 */
6821 return 0;
6822 }
6823 else
6824 return MB_ERR_INVALID_CHARS;
6825}
6826
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 * Decode a byte string from a Windows code page into unicode object in strict
6829 * mode.
6830 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006831 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6832 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006833 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006835decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006836 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006837 const char *in,
6838 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839{
Victor Stinner3a50e702011-10-18 21:21:00 +02006840 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006841 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006842 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843
6844 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006845 assert(insize > 0);
6846 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6847 if (outsize <= 0)
6848 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849
6850 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006852 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006853 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 if (*v == NULL)
6855 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006856 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857 }
6858 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006861 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864 }
6865
6866 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6868 if (outsize <= 0)
6869 goto error;
6870 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006871
Victor Stinner3a50e702011-10-18 21:21:00 +02006872error:
6873 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6874 return -2;
6875 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006876 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877}
6878
Victor Stinner3a50e702011-10-18 21:21:00 +02006879/*
6880 * Decode a byte string from a code page into unicode object with an error
6881 * handler.
6882 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006883 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006884 * UnicodeDecodeError exception and returns -1 on error.
6885 */
6886static int
6887decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006888 PyObject **v,
6889 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006890 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006891{
6892 const char *startin = in;
6893 const char *endin = in + size;
6894 const DWORD flags = decode_code_page_flags(code_page);
6895 /* Ideally, we should get reason from FormatMessage. This is the Windows
6896 2000 English version of the message. */
6897 const char *reason = "No mapping for the Unicode character exists "
6898 "in the target code page.";
6899 /* each step cannot decode more than 1 character, but a character can be
6900 represented as a surrogate pair */
6901 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006902 int insize;
6903 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006904 PyObject *errorHandler = NULL;
6905 PyObject *exc = NULL;
6906 PyObject *encoding_obj = NULL;
6907 char *encoding;
6908 DWORD err;
6909 int ret = -1;
6910
6911 assert(size > 0);
6912
6913 encoding = code_page_name(code_page, &encoding_obj);
6914 if (encoding == NULL)
6915 return -1;
6916
Victor Stinner7d00cc12014-03-17 23:08:06 +01006917 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006918 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6919 UnicodeDecodeError. */
6920 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6921 if (exc != NULL) {
6922 PyCodec_StrictErrors(exc);
6923 Py_CLEAR(exc);
6924 }
6925 goto error;
6926 }
6927
6928 if (*v == NULL) {
6929 /* Create unicode object */
6930 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6931 PyErr_NoMemory();
6932 goto error;
6933 }
Victor Stinnerab595942011-12-17 04:59:06 +01006934 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006935 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006936 if (*v == NULL)
6937 goto error;
6938 startout = PyUnicode_AS_UNICODE(*v);
6939 }
6940 else {
6941 /* Extend unicode object */
6942 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6943 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6944 PyErr_NoMemory();
6945 goto error;
6946 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006947 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 goto error;
6949 startout = PyUnicode_AS_UNICODE(*v) + n;
6950 }
6951
6952 /* Decode the byte string character per character */
6953 out = startout;
6954 while (in < endin)
6955 {
6956 /* Decode a character */
6957 insize = 1;
6958 do
6959 {
6960 outsize = MultiByteToWideChar(code_page, flags,
6961 in, insize,
6962 buffer, Py_ARRAY_LENGTH(buffer));
6963 if (outsize > 0)
6964 break;
6965 err = GetLastError();
6966 if (err != ERROR_NO_UNICODE_TRANSLATION
6967 && err != ERROR_INSUFFICIENT_BUFFER)
6968 {
6969 PyErr_SetFromWindowsErr(0);
6970 goto error;
6971 }
6972 insize++;
6973 }
6974 /* 4=maximum length of a UTF-8 sequence */
6975 while (insize <= 4 && (in + insize) <= endin);
6976
6977 if (outsize <= 0) {
6978 Py_ssize_t startinpos, endinpos, outpos;
6979
Victor Stinner7d00cc12014-03-17 23:08:06 +01006980 /* last character in partial decode? */
6981 if (in + insize >= endin && !final)
6982 break;
6983
Victor Stinner3a50e702011-10-18 21:21:00 +02006984 startinpos = in - startin;
6985 endinpos = startinpos + 1;
6986 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006987 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 errors, &errorHandler,
6989 encoding, reason,
6990 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006991 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 {
6993 goto error;
6994 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 }
6997 else {
6998 in += insize;
6999 memcpy(out, buffer, outsize * sizeof(wchar_t));
7000 out += outsize;
7001 }
7002 }
7003
7004 /* write a NUL character at the end */
7005 *out = 0;
7006
7007 /* Extend unicode object */
7008 outsize = out - startout;
7009 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007010 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007012 /* (in - startin) <= size and size is an int */
7013 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007014
7015error:
7016 Py_XDECREF(encoding_obj);
7017 Py_XDECREF(errorHandler);
7018 Py_XDECREF(exc);
7019 return ret;
7020}
7021
Victor Stinner3a50e702011-10-18 21:21:00 +02007022static PyObject *
7023decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007024 const char *s, Py_ssize_t size,
7025 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026{
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 PyObject *v = NULL;
7028 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029
Victor Stinner3a50e702011-10-18 21:21:00 +02007030 if (code_page < 0) {
7031 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7032 return NULL;
7033 }
7034
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037
Victor Stinner76a31a62011-11-04 00:05:13 +01007038 do
7039 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007041 if (size > INT_MAX) {
7042 chunk_size = INT_MAX;
7043 final = 0;
7044 done = 0;
7045 }
7046 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007048 {
7049 chunk_size = (int)size;
7050 final = (consumed == NULL);
7051 done = 1;
7052 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053
Victor Stinner76a31a62011-11-04 00:05:13 +01007054 if (chunk_size == 0 && done) {
7055 if (v != NULL)
7056 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007057 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 converted = decode_code_page_strict(code_page, &v,
7061 s, chunk_size);
7062 if (converted == -2)
7063 converted = decode_code_page_errors(code_page, &v,
7064 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007065 errors, final);
7066 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007067
7068 if (converted < 0) {
7069 Py_XDECREF(v);
7070 return NULL;
7071 }
7072
7073 if (consumed)
7074 *consumed += converted;
7075
7076 s += converted;
7077 size -= converted;
7078 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007079
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007080 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081}
7082
Alexander Belopolsky40018472011-02-26 01:02:56 +00007083PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007084PyUnicode_DecodeCodePageStateful(int code_page,
7085 const char *s,
7086 Py_ssize_t size,
7087 const char *errors,
7088 Py_ssize_t *consumed)
7089{
7090 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7091}
7092
7093PyObject *
7094PyUnicode_DecodeMBCSStateful(const char *s,
7095 Py_ssize_t size,
7096 const char *errors,
7097 Py_ssize_t *consumed)
7098{
7099 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7100}
7101
7102PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007103PyUnicode_DecodeMBCS(const char *s,
7104 Py_ssize_t size,
7105 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007106{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7108}
7109
Victor Stinner3a50e702011-10-18 21:21:00 +02007110static DWORD
7111encode_code_page_flags(UINT code_page, const char *errors)
7112{
7113 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007114 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 }
7116 else if (code_page == CP_UTF7) {
7117 /* CP_UTF7 only supports flags=0 */
7118 return 0;
7119 }
7120 else {
7121 if (errors != NULL && strcmp(errors, "replace") == 0)
7122 return 0;
7123 else
7124 return WC_NO_BEST_FIT_CHARS;
7125 }
7126}
7127
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 * Encode a Unicode string to a Windows code page into a byte string in strict
7130 * mode.
7131 *
7132 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007133 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007135static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007136encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007137 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139{
Victor Stinner554f3f02010-06-16 23:33:54 +00007140 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 BOOL *pusedDefaultChar = &usedDefaultChar;
7142 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007143 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007144 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007145 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 const DWORD flags = encode_code_page_flags(code_page, NULL);
7147 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007148 /* Create a substring so that we can get the UTF-16 representation
7149 of just the slice under consideration. */
7150 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151
Martin v. Löwis3d325192011-11-04 18:23:06 +01007152 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007155 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007157 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007158
Victor Stinner2fc507f2011-11-04 20:06:39 +01007159 substring = PyUnicode_Substring(unicode, offset, offset+len);
7160 if (substring == NULL)
7161 return -1;
7162 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7163 if (p == NULL) {
7164 Py_DECREF(substring);
7165 return -1;
7166 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007167 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007168
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007169 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007171 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 NULL, 0,
7173 NULL, pusedDefaultChar);
7174 if (outsize <= 0)
7175 goto error;
7176 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007177 if (pusedDefaultChar && *pusedDefaultChar) {
7178 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007181
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007185 if (*outbytes == NULL) {
7186 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007188 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190 }
7191 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 const Py_ssize_t n = PyBytes_Size(*outbytes);
7194 if (outsize > PY_SSIZE_T_MAX - n) {
7195 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007199 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7200 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007202 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204 }
7205
7206 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007208 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 out, outsize,
7210 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 if (outsize <= 0)
7213 goto error;
7214 if (pusedDefaultChar && *pusedDefaultChar)
7215 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007216 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007217
Victor Stinner3a50e702011-10-18 21:21:00 +02007218error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7221 return -2;
7222 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007223 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007224}
7225
Victor Stinner3a50e702011-10-18 21:21:00 +02007226/*
7227 * Encode a Unicode string to a Windows code page into a byte string using a
7228 * error handler.
7229 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007230 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 * -1 on other error.
7232 */
7233static int
7234encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007235 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007236 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007237{
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007239 Py_ssize_t pos = unicode_offset;
7240 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 /* Ideally, we should get reason from FormatMessage. This is the Windows
7242 2000 English version of the message. */
7243 const char *reason = "invalid character";
7244 /* 4=maximum length of a UTF-8 sequence */
7245 char buffer[4];
7246 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7247 Py_ssize_t outsize;
7248 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 PyObject *errorHandler = NULL;
7250 PyObject *exc = NULL;
7251 PyObject *encoding_obj = NULL;
7252 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007253 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 PyObject *rep;
7255 int ret = -1;
7256
7257 assert(insize > 0);
7258
7259 encoding = code_page_name(code_page, &encoding_obj);
7260 if (encoding == NULL)
7261 return -1;
7262
7263 if (errors == NULL || strcmp(errors, "strict") == 0) {
7264 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7265 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007266 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 if (exc != NULL) {
7268 PyCodec_StrictErrors(exc);
7269 Py_DECREF(exc);
7270 }
7271 Py_XDECREF(encoding_obj);
7272 return -1;
7273 }
7274
7275 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7276 pusedDefaultChar = &usedDefaultChar;
7277 else
7278 pusedDefaultChar = NULL;
7279
7280 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7281 PyErr_NoMemory();
7282 goto error;
7283 }
7284 outsize = insize * Py_ARRAY_LENGTH(buffer);
7285
7286 if (*outbytes == NULL) {
7287 /* Create string object */
7288 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7289 if (*outbytes == NULL)
7290 goto error;
7291 out = PyBytes_AS_STRING(*outbytes);
7292 }
7293 else {
7294 /* Extend string object */
7295 Py_ssize_t n = PyBytes_Size(*outbytes);
7296 if (n > PY_SSIZE_T_MAX - outsize) {
7297 PyErr_NoMemory();
7298 goto error;
7299 }
7300 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7301 goto error;
7302 out = PyBytes_AS_STRING(*outbytes) + n;
7303 }
7304
7305 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007308 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7309 wchar_t chars[2];
7310 int charsize;
7311 if (ch < 0x10000) {
7312 chars[0] = (wchar_t)ch;
7313 charsize = 1;
7314 }
7315 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007316 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7317 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007318 charsize = 2;
7319 }
7320
Victor Stinner3a50e702011-10-18 21:21:00 +02007321 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 buffer, Py_ARRAY_LENGTH(buffer),
7324 NULL, pusedDefaultChar);
7325 if (outsize > 0) {
7326 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7327 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007328 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 memcpy(out, buffer, outsize);
7330 out += outsize;
7331 continue;
7332 }
7333 }
7334 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7335 PyErr_SetFromWindowsErr(0);
7336 goto error;
7337 }
7338
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 rep = unicode_encode_call_errorhandler(
7340 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007341 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007342 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 if (rep == NULL)
7344 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007345 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007346
7347 if (PyBytes_Check(rep)) {
7348 outsize = PyBytes_GET_SIZE(rep);
7349 if (outsize != 1) {
7350 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7351 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7352 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7353 Py_DECREF(rep);
7354 goto error;
7355 }
7356 out = PyBytes_AS_STRING(*outbytes) + offset;
7357 }
7358 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7359 out += outsize;
7360 }
7361 else {
7362 Py_ssize_t i;
7363 enum PyUnicode_Kind kind;
7364 void *data;
7365
Benjamin Petersonbac79492012-01-14 13:34:47 -05007366 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 Py_DECREF(rep);
7368 goto error;
7369 }
7370
7371 outsize = PyUnicode_GET_LENGTH(rep);
7372 if (outsize != 1) {
7373 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7374 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7375 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7376 Py_DECREF(rep);
7377 goto error;
7378 }
7379 out = PyBytes_AS_STRING(*outbytes) + offset;
7380 }
7381 kind = PyUnicode_KIND(rep);
7382 data = PyUnicode_DATA(rep);
7383 for (i=0; i < outsize; i++) {
7384 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7385 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007386 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007387 encoding, unicode,
7388 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 "unable to encode error handler result to ASCII");
7390 Py_DECREF(rep);
7391 goto error;
7392 }
7393 *out = (unsigned char)ch;
7394 out++;
7395 }
7396 }
7397 Py_DECREF(rep);
7398 }
7399 /* write a NUL byte */
7400 *out = 0;
7401 outsize = out - PyBytes_AS_STRING(*outbytes);
7402 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7403 if (_PyBytes_Resize(outbytes, outsize) < 0)
7404 goto error;
7405 ret = 0;
7406
7407error:
7408 Py_XDECREF(encoding_obj);
7409 Py_XDECREF(errorHandler);
7410 Py_XDECREF(exc);
7411 return ret;
7412}
7413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414static PyObject *
7415encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007416 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const char *errors)
7418{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007419 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007421 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007422 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007423
Victor Stinner29dacf22015-01-26 16:41:32 +01007424 if (!PyUnicode_Check(unicode)) {
7425 PyErr_BadArgument();
7426 return NULL;
7427 }
7428
Benjamin Petersonbac79492012-01-14 13:34:47 -05007429 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 return NULL;
7431 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007432
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 if (code_page < 0) {
7434 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7435 return NULL;
7436 }
7437
Martin v. Löwis3d325192011-11-04 18:23:06 +01007438 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007439 return PyBytes_FromStringAndSize(NULL, 0);
7440
Victor Stinner7581cef2011-11-03 22:32:33 +01007441 offset = 0;
7442 do
7443 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446 chunks. */
7447 if (len > INT_MAX/2) {
7448 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007449 done = 0;
7450 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007451 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 done = 1;
7456 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457
Victor Stinner76a31a62011-11-04 00:05:13 +01007458 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007459 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007460 errors);
7461 if (ret == -2)
7462 ret = encode_code_page_errors(code_page, &outbytes,
7463 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007464 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007465 if (ret < 0) {
7466 Py_XDECREF(outbytes);
7467 return NULL;
7468 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469
Victor Stinner7581cef2011-11-03 22:32:33 +01007470 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007471 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007472 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 return outbytes;
7475}
7476
7477PyObject *
7478PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7479 Py_ssize_t size,
7480 const char *errors)
7481{
Victor Stinner7581cef2011-11-03 22:32:33 +01007482 PyObject *unicode, *res;
7483 unicode = PyUnicode_FromUnicode(p, size);
7484 if (unicode == NULL)
7485 return NULL;
7486 res = encode_code_page(CP_ACP, unicode, errors);
7487 Py_DECREF(unicode);
7488 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007489}
7490
7491PyObject *
7492PyUnicode_EncodeCodePage(int code_page,
7493 PyObject *unicode,
7494 const char *errors)
7495{
Victor Stinner7581cef2011-11-03 22:32:33 +01007496 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007497}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007498
Alexander Belopolsky40018472011-02-26 01:02:56 +00007499PyObject *
7500PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007501{
Victor Stinner7581cef2011-11-03 22:32:33 +01007502 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007503}
7504
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505#undef NEED_RETRY
7506
Victor Stinner99b95382011-07-04 14:23:54 +02007507#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007508
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509/* --- Character Mapping Codec -------------------------------------------- */
7510
Victor Stinnerfb161b12013-04-18 01:44:27 +02007511static int
7512charmap_decode_string(const char *s,
7513 Py_ssize_t size,
7514 PyObject *mapping,
7515 const char *errors,
7516 _PyUnicodeWriter *writer)
7517{
7518 const char *starts = s;
7519 const char *e;
7520 Py_ssize_t startinpos, endinpos;
7521 PyObject *errorHandler = NULL, *exc = NULL;
7522 Py_ssize_t maplen;
7523 enum PyUnicode_Kind mapkind;
7524 void *mapdata;
7525 Py_UCS4 x;
7526 unsigned char ch;
7527
7528 if (PyUnicode_READY(mapping) == -1)
7529 return -1;
7530
7531 maplen = PyUnicode_GET_LENGTH(mapping);
7532 mapdata = PyUnicode_DATA(mapping);
7533 mapkind = PyUnicode_KIND(mapping);
7534
7535 e = s + size;
7536
7537 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7538 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7539 * is disabled in encoding aliases, latin1 is preferred because
7540 * its implementation is faster. */
7541 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7542 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7543 Py_UCS4 maxchar = writer->maxchar;
7544
7545 assert (writer->kind == PyUnicode_1BYTE_KIND);
7546 while (s < e) {
7547 ch = *s;
7548 x = mapdata_ucs1[ch];
7549 if (x > maxchar) {
7550 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7551 goto onError;
7552 maxchar = writer->maxchar;
7553 outdata = (Py_UCS1 *)writer->data;
7554 }
7555 outdata[writer->pos] = x;
7556 writer->pos++;
7557 ++s;
7558 }
7559 return 0;
7560 }
7561
7562 while (s < e) {
7563 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7564 enum PyUnicode_Kind outkind = writer->kind;
7565 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7566 if (outkind == PyUnicode_1BYTE_KIND) {
7567 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7568 Py_UCS4 maxchar = writer->maxchar;
7569 while (s < e) {
7570 ch = *s;
7571 x = mapdata_ucs2[ch];
7572 if (x > maxchar)
7573 goto Error;
7574 outdata[writer->pos] = x;
7575 writer->pos++;
7576 ++s;
7577 }
7578 break;
7579 }
7580 else if (outkind == PyUnicode_2BYTE_KIND) {
7581 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7582 while (s < e) {
7583 ch = *s;
7584 x = mapdata_ucs2[ch];
7585 if (x == 0xFFFE)
7586 goto Error;
7587 outdata[writer->pos] = x;
7588 writer->pos++;
7589 ++s;
7590 }
7591 break;
7592 }
7593 }
7594 ch = *s;
7595
7596 if (ch < maplen)
7597 x = PyUnicode_READ(mapkind, mapdata, ch);
7598 else
7599 x = 0xfffe; /* invalid value */
7600Error:
7601 if (x == 0xfffe)
7602 {
7603 /* undefined mapping */
7604 startinpos = s-starts;
7605 endinpos = startinpos+1;
7606 if (unicode_decode_call_errorhandler_writer(
7607 errors, &errorHandler,
7608 "charmap", "character maps to <undefined>",
7609 &starts, &e, &startinpos, &endinpos, &exc, &s,
7610 writer)) {
7611 goto onError;
7612 }
7613 continue;
7614 }
7615
7616 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7617 goto onError;
7618 ++s;
7619 }
7620 Py_XDECREF(errorHandler);
7621 Py_XDECREF(exc);
7622 return 0;
7623
7624onError:
7625 Py_XDECREF(errorHandler);
7626 Py_XDECREF(exc);
7627 return -1;
7628}
7629
7630static int
7631charmap_decode_mapping(const char *s,
7632 Py_ssize_t size,
7633 PyObject *mapping,
7634 const char *errors,
7635 _PyUnicodeWriter *writer)
7636{
7637 const char *starts = s;
7638 const char *e;
7639 Py_ssize_t startinpos, endinpos;
7640 PyObject *errorHandler = NULL, *exc = NULL;
7641 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007642 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007643
7644 e = s + size;
7645
7646 while (s < e) {
7647 ch = *s;
7648
7649 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7650 key = PyLong_FromLong((long)ch);
7651 if (key == NULL)
7652 goto onError;
7653
7654 item = PyObject_GetItem(mapping, key);
7655 Py_DECREF(key);
7656 if (item == NULL) {
7657 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7658 /* No mapping found means: mapping is undefined. */
7659 PyErr_Clear();
7660 goto Undefined;
7661 } else
7662 goto onError;
7663 }
7664
7665 /* Apply mapping */
7666 if (item == Py_None)
7667 goto Undefined;
7668 if (PyLong_Check(item)) {
7669 long value = PyLong_AS_LONG(item);
7670 if (value == 0xFFFE)
7671 goto Undefined;
7672 if (value < 0 || value > MAX_UNICODE) {
7673 PyErr_Format(PyExc_TypeError,
7674 "character mapping must be in range(0x%lx)",
7675 (unsigned long)MAX_UNICODE + 1);
7676 goto onError;
7677 }
7678
7679 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7680 goto onError;
7681 }
7682 else if (PyUnicode_Check(item)) {
7683 if (PyUnicode_READY(item) == -1)
7684 goto onError;
7685 if (PyUnicode_GET_LENGTH(item) == 1) {
7686 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7687 if (value == 0xFFFE)
7688 goto Undefined;
7689 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7690 goto onError;
7691 }
7692 else {
7693 writer->overallocate = 1;
7694 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7695 goto onError;
7696 }
7697 }
7698 else {
7699 /* wrong return value */
7700 PyErr_SetString(PyExc_TypeError,
7701 "character mapping must return integer, None or str");
7702 goto onError;
7703 }
7704 Py_CLEAR(item);
7705 ++s;
7706 continue;
7707
7708Undefined:
7709 /* undefined mapping */
7710 Py_CLEAR(item);
7711 startinpos = s-starts;
7712 endinpos = startinpos+1;
7713 if (unicode_decode_call_errorhandler_writer(
7714 errors, &errorHandler,
7715 "charmap", "character maps to <undefined>",
7716 &starts, &e, &startinpos, &endinpos, &exc, &s,
7717 writer)) {
7718 goto onError;
7719 }
7720 }
7721 Py_XDECREF(errorHandler);
7722 Py_XDECREF(exc);
7723 return 0;
7724
7725onError:
7726 Py_XDECREF(item);
7727 Py_XDECREF(errorHandler);
7728 Py_XDECREF(exc);
7729 return -1;
7730}
7731
Alexander Belopolsky40018472011-02-26 01:02:56 +00007732PyObject *
7733PyUnicode_DecodeCharmap(const char *s,
7734 Py_ssize_t size,
7735 PyObject *mapping,
7736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007738 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007739
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 /* Default to Latin-1 */
7741 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007745 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007746 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007747 writer.min_length = size;
7748 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007750
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007751 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007752 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7753 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007754 }
7755 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007756 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7757 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007759 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007760
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007762 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 return NULL;
7764}
7765
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766/* Charmap encoding: the lookup table */
7767
Alexander Belopolsky40018472011-02-26 01:02:56 +00007768struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 PyObject_HEAD
7770 unsigned char level1[32];
7771 int count2, count3;
7772 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773};
7774
7775static PyObject*
7776encoding_map_size(PyObject *obj, PyObject* args)
7777{
7778 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007779 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781}
7782
7783static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 PyDoc_STR("Return the size (in bytes) of this object") },
7786 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787};
7788
7789static void
7790encoding_map_dealloc(PyObject* o)
7791{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007793}
7794
7795static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007796 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 "EncodingMap", /*tp_name*/
7798 sizeof(struct encoding_map), /*tp_basicsize*/
7799 0, /*tp_itemsize*/
7800 /* methods */
7801 encoding_map_dealloc, /*tp_dealloc*/
7802 0, /*tp_print*/
7803 0, /*tp_getattr*/
7804 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007805 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 0, /*tp_repr*/
7807 0, /*tp_as_number*/
7808 0, /*tp_as_sequence*/
7809 0, /*tp_as_mapping*/
7810 0, /*tp_hash*/
7811 0, /*tp_call*/
7812 0, /*tp_str*/
7813 0, /*tp_getattro*/
7814 0, /*tp_setattro*/
7815 0, /*tp_as_buffer*/
7816 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7817 0, /*tp_doc*/
7818 0, /*tp_traverse*/
7819 0, /*tp_clear*/
7820 0, /*tp_richcompare*/
7821 0, /*tp_weaklistoffset*/
7822 0, /*tp_iter*/
7823 0, /*tp_iternext*/
7824 encoding_map_methods, /*tp_methods*/
7825 0, /*tp_members*/
7826 0, /*tp_getset*/
7827 0, /*tp_base*/
7828 0, /*tp_dict*/
7829 0, /*tp_descr_get*/
7830 0, /*tp_descr_set*/
7831 0, /*tp_dictoffset*/
7832 0, /*tp_init*/
7833 0, /*tp_alloc*/
7834 0, /*tp_new*/
7835 0, /*tp_free*/
7836 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837};
7838
7839PyObject*
7840PyUnicode_BuildEncodingMap(PyObject* string)
7841{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007842 PyObject *result;
7843 struct encoding_map *mresult;
7844 int i;
7845 int need_dict = 0;
7846 unsigned char level1[32];
7847 unsigned char level2[512];
7848 unsigned char *mlevel1, *mlevel2, *mlevel3;
7849 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 int kind;
7851 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007852 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007853 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007855 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007856 PyErr_BadArgument();
7857 return NULL;
7858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 kind = PyUnicode_KIND(string);
7860 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007861 length = PyUnicode_GET_LENGTH(string);
7862 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863 memset(level1, 0xFF, sizeof level1);
7864 memset(level2, 0xFF, sizeof level2);
7865
7866 /* If there isn't a one-to-one mapping of NULL to \0,
7867 or if there are non-BMP characters, we need to use
7868 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007869 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007871 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007872 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007873 ch = PyUnicode_READ(kind, data, i);
7874 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007875 need_dict = 1;
7876 break;
7877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879 /* unmapped character */
7880 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 l1 = ch >> 11;
7882 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 if (level1[l1] == 0xFF)
7884 level1[l1] = count2++;
7885 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007886 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 }
7888
7889 if (count2 >= 0xFF || count3 >= 0xFF)
7890 need_dict = 1;
7891
7892 if (need_dict) {
7893 PyObject *result = PyDict_New();
7894 PyObject *key, *value;
7895 if (!result)
7896 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007897 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007899 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007900 if (!key || !value)
7901 goto failed1;
7902 if (PyDict_SetItem(result, key, value) == -1)
7903 goto failed1;
7904 Py_DECREF(key);
7905 Py_DECREF(value);
7906 }
7907 return result;
7908 failed1:
7909 Py_XDECREF(key);
7910 Py_XDECREF(value);
7911 Py_DECREF(result);
7912 return NULL;
7913 }
7914
7915 /* Create a three-level trie */
7916 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7917 16*count2 + 128*count3 - 1);
7918 if (!result)
7919 return PyErr_NoMemory();
7920 PyObject_Init(result, &EncodingMapType);
7921 mresult = (struct encoding_map*)result;
7922 mresult->count2 = count2;
7923 mresult->count3 = count3;
7924 mlevel1 = mresult->level1;
7925 mlevel2 = mresult->level23;
7926 mlevel3 = mresult->level23 + 16*count2;
7927 memcpy(mlevel1, level1, 32);
7928 memset(mlevel2, 0xFF, 16*count2);
7929 memset(mlevel3, 0, 128*count3);
7930 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007931 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007933 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7934 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007935 /* unmapped character */
7936 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007937 o1 = ch>>11;
7938 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007939 i2 = 16*mlevel1[o1] + o2;
7940 if (mlevel2[i2] == 0xFF)
7941 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007942 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943 i3 = 128*mlevel2[i2] + o3;
7944 mlevel3[i3] = i;
7945 }
7946 return result;
7947}
7948
7949static int
Victor Stinner22168992011-11-20 17:09:18 +01007950encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951{
7952 struct encoding_map *map = (struct encoding_map*)mapping;
7953 int l1 = c>>11;
7954 int l2 = (c>>7) & 0xF;
7955 int l3 = c & 0x7F;
7956 int i;
7957
Victor Stinner22168992011-11-20 17:09:18 +01007958 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960 if (c == 0)
7961 return 0;
7962 /* level 1*/
7963 i = map->level1[l1];
7964 if (i == 0xFF) {
7965 return -1;
7966 }
7967 /* level 2*/
7968 i = map->level23[16*i+l2];
7969 if (i == 0xFF) {
7970 return -1;
7971 }
7972 /* level 3 */
7973 i = map->level23[16*map->count2 + 128*i + l3];
7974 if (i == 0) {
7975 return -1;
7976 }
7977 return i;
7978}
7979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980/* Lookup the character ch in the mapping. If the character
7981 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007982 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007983static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007984charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985{
Christian Heimes217cfd12007-12-02 14:31:20 +00007986 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007987 PyObject *x;
7988
7989 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991 x = PyObject_GetItem(mapping, w);
7992 Py_DECREF(w);
7993 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7995 /* No mapping found means: mapping is undefined. */
7996 PyErr_Clear();
7997 x = Py_None;
7998 Py_INCREF(x);
7999 return x;
8000 } else
8001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008003 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008005 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 long value = PyLong_AS_LONG(x);
8007 if (value < 0 || value > 255) {
8008 PyErr_SetString(PyExc_TypeError,
8009 "character mapping must be in range(256)");
8010 Py_DECREF(x);
8011 return NULL;
8012 }
8013 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008015 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 /* wrong return value */
8019 PyErr_Format(PyExc_TypeError,
8020 "character mapping must return integer, bytes or None, not %.400s",
8021 x->ob_type->tp_name);
8022 Py_DECREF(x);
8023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 }
8025}
8026
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008027static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008028charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008030 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8031 /* exponentially overallocate to minimize reallocations */
8032 if (requiredsize < 2*outsize)
8033 requiredsize = 2*outsize;
8034 if (_PyBytes_Resize(outobj, requiredsize))
8035 return -1;
8036 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037}
8038
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008043 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044 space is available. Return a new reference to the object that
8045 was put in the output buffer, or Py_None, if the mapping was undefined
8046 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008047 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008049charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052 PyObject *rep;
8053 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008054 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008055
Christian Heimes90aa7642007-12-19 02:45:37 +00008056 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059 if (res == -1)
8060 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 if (outsize<requiredsize)
8062 if (charmapencode_resize(outobj, outpos, requiredsize))
8063 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 outstart[(*outpos)++] = (char)res;
8066 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 }
8068
8069 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 Py_DECREF(rep);
8074 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 if (PyLong_Check(rep)) {
8077 Py_ssize_t requiredsize = *outpos+1;
8078 if (outsize<requiredsize)
8079 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8080 Py_DECREF(rep);
8081 return enc_EXCEPTION;
8082 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008083 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 else {
8087 const char *repchars = PyBytes_AS_STRING(rep);
8088 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8089 Py_ssize_t requiredsize = *outpos+repsize;
8090 if (outsize<requiredsize)
8091 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8092 Py_DECREF(rep);
8093 return enc_EXCEPTION;
8094 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008095 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 memcpy(outstart + *outpos, repchars, repsize);
8097 *outpos += repsize;
8098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 Py_DECREF(rep);
8101 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102}
8103
8104/* handle an error in PyUnicode_EncodeCharmap
8105 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008106static int
8107charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008108 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008110 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008111 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112{
8113 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008114 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008115 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008116 enum PyUnicode_Kind kind;
8117 void *data;
8118 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008120 Py_ssize_t collstartpos = *inpos;
8121 Py_ssize_t collendpos = *inpos+1;
8122 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008123 char *encoding = "charmap";
8124 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008126 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008127 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128
Benjamin Petersonbac79492012-01-14 13:34:47 -05008129 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 return -1;
8131 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132 /* find all unencodable characters */
8133 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008135 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008137 val = encoding_map_lookup(ch, mapping);
8138 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 break;
8140 ++collendpos;
8141 continue;
8142 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8145 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 if (rep==NULL)
8147 return -1;
8148 else if (rep!=Py_None) {
8149 Py_DECREF(rep);
8150 break;
8151 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154 }
8155 /* cache callback name lookup
8156 * (if not done yet, i.e. it's the first error) */
8157 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 if ((errors==NULL) || (!strcmp(errors, "strict")))
8159 *known_errorHandler = 1;
8160 else if (!strcmp(errors, "replace"))
8161 *known_errorHandler = 2;
8162 else if (!strcmp(errors, "ignore"))
8163 *known_errorHandler = 3;
8164 else if (!strcmp(errors, "xmlcharrefreplace"))
8165 *known_errorHandler = 4;
8166 else
8167 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008168 }
8169 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008171 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 return -1;
8173 case 2: /* replace */
8174 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 x = charmapencode_output('?', mapping, res, respos);
8176 if (x==enc_EXCEPTION) {
8177 return -1;
8178 }
8179 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008180 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return -1;
8182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 }
8184 /* fall through */
8185 case 3: /* ignore */
8186 *inpos = collendpos;
8187 break;
8188 case 4: /* xmlcharrefreplace */
8189 /* generate replacement (temporarily (mis)uses p) */
8190 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 char buffer[2+29+1+1];
8192 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008193 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 for (cp = buffer; *cp; ++cp) {
8195 x = charmapencode_output(*cp, mapping, res, respos);
8196 if (x==enc_EXCEPTION)
8197 return -1;
8198 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008199 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 return -1;
8201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 }
8203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 *inpos = collendpos;
8205 break;
8206 default:
8207 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008208 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008212 if (PyBytes_Check(repunicode)) {
8213 /* Directly copy bytes result to output. */
8214 Py_ssize_t outsize = PyBytes_Size(*res);
8215 Py_ssize_t requiredsize;
8216 repsize = PyBytes_Size(repunicode);
8217 requiredsize = *respos + repsize;
8218 if (requiredsize > outsize)
8219 /* Make room for all additional bytes. */
8220 if (charmapencode_resize(res, respos, requiredsize)) {
8221 Py_DECREF(repunicode);
8222 return -1;
8223 }
8224 memcpy(PyBytes_AsString(*res) + *respos,
8225 PyBytes_AsString(repunicode), repsize);
8226 *respos += repsize;
8227 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008228 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008229 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008231 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008232 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008233 Py_DECREF(repunicode);
8234 return -1;
8235 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008236 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008237 data = PyUnicode_DATA(repunicode);
8238 kind = PyUnicode_KIND(repunicode);
8239 for (index = 0; index < repsize; index++) {
8240 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8241 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008243 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return -1;
8245 }
8246 else if (x==enc_FAILED) {
8247 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008248 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 return -1;
8250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008251 }
8252 *inpos = newpos;
8253 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 }
8255 return 0;
8256}
8257
Alexander Belopolsky40018472011-02-26 01:02:56 +00008258PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008259_PyUnicode_EncodeCharmap(PyObject *unicode,
8260 PyObject *mapping,
8261 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 /* output object */
8264 PyObject *res = NULL;
8265 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008266 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008269 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 PyObject *errorHandler = NULL;
8271 PyObject *exc = NULL;
8272 /* the following variable is used for caching string comparisons
8273 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8274 * 3=ignore, 4=xmlcharrefreplace */
8275 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008276 void *data;
8277 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
Benjamin Petersonbac79492012-01-14 13:34:47 -05008279 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008280 return NULL;
8281 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008282 data = PyUnicode_DATA(unicode);
8283 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008284
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 /* Default to Latin-1 */
8286 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008287 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 /* allocate enough for a simple encoding without
8290 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008291 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 if (res == NULL)
8293 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008294 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008298 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008300 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 if (x==enc_EXCEPTION) /* error */
8302 goto onError;
8303 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008304 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 &exc,
8306 &known_errorHandler, &errorHandler, errors,
8307 &res, &respos)) {
8308 goto onError;
8309 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 else
8312 /* done with this character => adjust input position */
8313 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008317 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008318 if (_PyBytes_Resize(&res, respos) < 0)
8319 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 Py_XDECREF(exc);
8322 Py_XDECREF(errorHandler);
8323 return res;
8324
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326 Py_XDECREF(res);
8327 Py_XDECREF(exc);
8328 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 return NULL;
8330}
8331
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008332/* Deprecated */
8333PyObject *
8334PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8335 Py_ssize_t size,
8336 PyObject *mapping,
8337 const char *errors)
8338{
8339 PyObject *result;
8340 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8341 if (unicode == NULL)
8342 return NULL;
8343 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8344 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008345 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008346}
8347
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348PyObject *
8349PyUnicode_AsCharmapString(PyObject *unicode,
8350 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
8352 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 PyErr_BadArgument();
8354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357}
8358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360static void
8361make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363 Py_ssize_t startpos, Py_ssize_t endpos,
8364 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 *exceptionObject = _PyUnicodeTranslateError_Create(
8368 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 }
8370 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8372 goto onError;
8373 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8374 goto onError;
8375 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8376 goto onError;
8377 return;
8378 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008379 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 }
8381}
8382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383/* error handling callback helper:
8384 build arguments, call the callback and check the arguments,
8385 put the result into newpos and return the replacement string, which
8386 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387static PyObject *
8388unicode_translate_call_errorhandler(const char *errors,
8389 PyObject **errorHandler,
8390 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008392 Py_ssize_t startpos, Py_ssize_t endpos,
8393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008395 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008397 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 PyObject *restuple;
8399 PyObject *resunicode;
8400
8401 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 }
8406
8407 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411
8412 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008417 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 Py_DECREF(restuple);
8419 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 }
8421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 &resunicode, &i_newpos)) {
8423 Py_DECREF(restuple);
8424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008426 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008428 else
8429 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008431 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 Py_DECREF(restuple);
8433 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 Py_INCREF(resunicode);
8436 Py_DECREF(restuple);
8437 return resunicode;
8438}
8439
8440/* Lookup the character ch in the mapping and put the result in result,
8441 which must be decrefed by the caller.
8442 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008443static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445{
Christian Heimes217cfd12007-12-02 14:31:20 +00008446 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 PyObject *x;
8448
8449 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 x = PyObject_GetItem(mapping, w);
8452 Py_DECREF(w);
8453 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8455 /* No mapping found means: use 1:1 mapping. */
8456 PyErr_Clear();
8457 *result = NULL;
8458 return 0;
8459 } else
8460 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 }
8462 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 *result = x;
8464 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008466 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008468 if (value < 0 || value > MAX_UNICODE) {
8469 PyErr_Format(PyExc_ValueError,
8470 "character mapping must be in range(0x%x)",
8471 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 Py_DECREF(x);
8473 return -1;
8474 }
8475 *result = x;
8476 return 0;
8477 }
8478 else if (PyUnicode_Check(x)) {
8479 *result = x;
8480 return 0;
8481 }
8482 else {
8483 /* wrong return value */
8484 PyErr_SetString(PyExc_TypeError,
8485 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008486 Py_DECREF(x);
8487 return -1;
8488 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489}
Victor Stinner1194ea02014-04-04 19:37:40 +02008490
8491/* lookup the character, write the result into the writer.
8492 Return 1 if the result was written into the writer, return 0 if the mapping
8493 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008494static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008495charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8496 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497{
Victor Stinner1194ea02014-04-04 19:37:40 +02008498 PyObject *item;
8499
8500 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008502
8503 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008505 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008508 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008510
8511 if (item == Py_None) {
8512 Py_DECREF(item);
8513 return 0;
8514 }
8515
8516 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008517 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8518 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8519 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008520 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8521 Py_DECREF(item);
8522 return -1;
8523 }
8524 Py_DECREF(item);
8525 return 1;
8526 }
8527
8528 if (!PyUnicode_Check(item)) {
8529 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008531 }
8532
8533 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8534 Py_DECREF(item);
8535 return -1;
8536 }
8537
8538 Py_DECREF(item);
8539 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540}
8541
Victor Stinner89a76ab2014-04-05 11:44:04 +02008542static int
8543unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8544 Py_UCS1 *translate)
8545{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008546 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008547 int ret = 0;
8548
Victor Stinner89a76ab2014-04-05 11:44:04 +02008549 if (charmaptranslate_lookup(ch, mapping, &item)) {
8550 return -1;
8551 }
8552
8553 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008554 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008555 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008556 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008557 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008558 /* not found => default to 1:1 mapping */
8559 translate[ch] = ch;
8560 return 1;
8561 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008562 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008563 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008564 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8565 used it */
8566 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008567 /* invalid character or character outside ASCII:
8568 skip the fast translate */
8569 goto exit;
8570 }
8571 translate[ch] = (Py_UCS1)replace;
8572 }
8573 else if (PyUnicode_Check(item)) {
8574 Py_UCS4 replace;
8575
8576 if (PyUnicode_READY(item) == -1) {
8577 Py_DECREF(item);
8578 return -1;
8579 }
8580 if (PyUnicode_GET_LENGTH(item) != 1)
8581 goto exit;
8582
8583 replace = PyUnicode_READ_CHAR(item, 0);
8584 if (replace > 127)
8585 goto exit;
8586 translate[ch] = (Py_UCS1)replace;
8587 }
8588 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008589 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008590 goto exit;
8591 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008592 ret = 1;
8593
Benjamin Peterson1365de72014-04-07 20:15:41 -04008594 exit:
8595 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008596 return ret;
8597}
8598
8599/* Fast path for ascii => ascii translation. Return 1 if the whole string
8600 was translated into writer, return 0 if the input string was partially
8601 translated into writer, raise an exception and return -1 on error. */
8602static int
8603unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008604 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008605{
Victor Stinner872b2912014-04-05 14:27:07 +02008606 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008607 Py_ssize_t len;
8608 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008609 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008610
8611 if (PyUnicode_READY(input) == -1)
8612 return -1;
8613 if (!PyUnicode_IS_ASCII(input))
8614 return 0;
8615 len = PyUnicode_GET_LENGTH(input);
8616
Victor Stinner872b2912014-04-05 14:27:07 +02008617 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008618
8619 in = PyUnicode_1BYTE_DATA(input);
8620 end = in + len;
8621
8622 assert(PyUnicode_IS_ASCII(writer->buffer));
8623 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8624 out = PyUnicode_1BYTE_DATA(writer->buffer);
8625
Victor Stinner872b2912014-04-05 14:27:07 +02008626 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008627 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008628 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008629 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008630 int translate = unicode_fast_translate_lookup(mapping, ch,
8631 ascii_table);
8632 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008633 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008634 if (translate == 0)
8635 goto exit;
8636 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008637 }
Victor Stinner872b2912014-04-05 14:27:07 +02008638 if (ch2 == 0xfe) {
8639 if (ignore)
8640 continue;
8641 goto exit;
8642 }
8643 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008644 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008645 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008646 }
Victor Stinner872b2912014-04-05 14:27:07 +02008647 res = 1;
8648
8649exit:
8650 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8651 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008652}
8653
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655_PyUnicode_TranslateCharmap(PyObject *input,
8656 PyObject *mapping,
8657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008660 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 Py_ssize_t size, i;
8662 int kind;
8663 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008664 _PyUnicodeWriter writer;
8665 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 char *reason = "character maps to <undefined>";
8667 PyObject *errorHandler = NULL;
8668 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008669 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008670 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 PyErr_BadArgument();
8674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 if (PyUnicode_READY(input) == -1)
8678 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008679 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 kind = PyUnicode_KIND(input);
8681 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682
8683 if (size == 0) {
8684 Py_INCREF(input);
8685 return input;
8686 }
8687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 /* allocate enough for a simple 1:1 translation without
8689 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008690 _PyUnicodeWriter_Init(&writer);
8691 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
Victor Stinner872b2912014-04-05 14:27:07 +02008694 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8695
8696 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008697 if (res < 0) {
8698 _PyUnicodeWriter_Dealloc(&writer);
8699 return NULL;
8700 }
8701 if (res == 1)
8702 return _PyUnicodeWriter_Finish(&writer);
8703
Victor Stinner89a76ab2014-04-05 11:44:04 +02008704 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008707 int translate;
8708 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8709 Py_ssize_t newpos;
8710 /* startpos for collecting untranslatable chars */
8711 Py_ssize_t collstart;
8712 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008713 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714
Victor Stinner1194ea02014-04-04 19:37:40 +02008715 ch = PyUnicode_READ(kind, data, i);
8716 translate = charmaptranslate_output(ch, mapping, &writer);
8717 if (translate < 0)
8718 goto onError;
8719
8720 if (translate != 0) {
8721 /* it worked => adjust input pointer */
8722 ++i;
8723 continue;
8724 }
8725
8726 /* untranslatable character */
8727 collstart = i;
8728 collend = i+1;
8729
8730 /* find all untranslatable characters */
8731 while (collend < size) {
8732 PyObject *x;
8733 ch = PyUnicode_READ(kind, data, collend);
8734 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008735 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008736 Py_XDECREF(x);
8737 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008739 ++collend;
8740 }
8741
8742 if (ignore) {
8743 i = collend;
8744 }
8745 else {
8746 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8747 reason, input, &exc,
8748 collstart, collend, &newpos);
8749 if (repunicode == NULL)
8750 goto onError;
8751 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008753 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008754 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008755 Py_DECREF(repunicode);
8756 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 }
8758 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 Py_XDECREF(exc);
8760 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008761 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008764 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765 Py_XDECREF(exc);
8766 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 return NULL;
8768}
8769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770/* Deprecated. Use PyUnicode_Translate instead. */
8771PyObject *
8772PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8773 Py_ssize_t size,
8774 PyObject *mapping,
8775 const char *errors)
8776{
Christian Heimes5f520f42012-09-11 14:03:25 +02008777 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8779 if (!unicode)
8780 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008781 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8782 Py_DECREF(unicode);
8783 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784}
8785
Alexander Belopolsky40018472011-02-26 01:02:56 +00008786PyObject *
8787PyUnicode_Translate(PyObject *str,
8788 PyObject *mapping,
8789 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790{
8791 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008792
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 str = PyUnicode_FromObject(str);
8794 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008795 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 Py_DECREF(str);
8798 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799}
Tim Petersced69f82003-09-16 20:30:58 +00008800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008802fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803{
8804 /* No need to call PyUnicode_READY(self) because this function is only
8805 called as a callback from fixup() which does it already. */
8806 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8807 const int kind = PyUnicode_KIND(self);
8808 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008809 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008810 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 Py_ssize_t i;
8812
8813 for (i = 0; i < len; ++i) {
8814 ch = PyUnicode_READ(kind, data, i);
8815 fixed = 0;
8816 if (ch > 127) {
8817 if (Py_UNICODE_ISSPACE(ch))
8818 fixed = ' ';
8819 else {
8820 const int decimal = Py_UNICODE_TODECIMAL(ch);
8821 if (decimal >= 0)
8822 fixed = '0' + decimal;
8823 }
8824 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008825 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008826 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 PyUnicode_WRITE(kind, data, i, fixed);
8828 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008829 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008830 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 }
8833
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008834 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835}
8836
8837PyObject *
8838_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8839{
8840 if (!PyUnicode_Check(unicode)) {
8841 PyErr_BadInternalCall();
8842 return NULL;
8843 }
8844 if (PyUnicode_READY(unicode) == -1)
8845 return NULL;
8846 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8847 /* If the string is already ASCII, just return the same string */
8848 Py_INCREF(unicode);
8849 return unicode;
8850 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008851 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852}
8853
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008854PyObject *
8855PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8856 Py_ssize_t length)
8857{
Victor Stinnerf0124502011-11-21 23:12:56 +01008858 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008859 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008860 Py_UCS4 maxchar;
8861 enum PyUnicode_Kind kind;
8862 void *data;
8863
Victor Stinner99d7ad02012-02-22 13:37:39 +01008864 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008865 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008866 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008867 if (ch > 127) {
8868 int decimal = Py_UNICODE_TODECIMAL(ch);
8869 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008870 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008871 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008872 }
8873 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008874
8875 /* Copy to a new string */
8876 decimal = PyUnicode_New(length, maxchar);
8877 if (decimal == NULL)
8878 return decimal;
8879 kind = PyUnicode_KIND(decimal);
8880 data = PyUnicode_DATA(decimal);
8881 /* Iterate over code points */
8882 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008883 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008884 if (ch > 127) {
8885 int decimal = Py_UNICODE_TODECIMAL(ch);
8886 if (decimal >= 0)
8887 ch = '0' + decimal;
8888 }
8889 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008891 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008892}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008893/* --- Decimal Encoder ---------------------------------------------------- */
8894
Alexander Belopolsky40018472011-02-26 01:02:56 +00008895int
8896PyUnicode_EncodeDecimal(Py_UNICODE *s,
8897 Py_ssize_t length,
8898 char *output,
8899 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008900{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008901 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008902 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008903 enum PyUnicode_Kind kind;
8904 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008905
8906 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 PyErr_BadArgument();
8908 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008909 }
8910
Victor Stinner42bf7752011-11-21 22:52:58 +01008911 unicode = PyUnicode_FromUnicode(s, length);
8912 if (unicode == NULL)
8913 return -1;
8914
Benjamin Petersonbac79492012-01-14 13:34:47 -05008915 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008916 Py_DECREF(unicode);
8917 return -1;
8918 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008919 kind = PyUnicode_KIND(unicode);
8920 data = PyUnicode_DATA(unicode);
8921
Victor Stinnerb84d7232011-11-22 01:50:07 +01008922 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008923 PyObject *exc;
8924 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008926 Py_ssize_t startpos;
8927
8928 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008929
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008931 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008932 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008934 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 decimal = Py_UNICODE_TODECIMAL(ch);
8936 if (decimal >= 0) {
8937 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008938 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 continue;
8940 }
8941 if (0 < ch && ch < 256) {
8942 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008943 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 continue;
8945 }
Victor Stinner6345be92011-11-25 20:09:01 +01008946
Victor Stinner42bf7752011-11-21 22:52:58 +01008947 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008948 exc = NULL;
8949 raise_encode_exception(&exc, "decimal", unicode,
8950 startpos, startpos+1,
8951 "invalid decimal Unicode string");
8952 Py_XDECREF(exc);
8953 Py_DECREF(unicode);
8954 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008955 }
8956 /* 0-terminate the output string */
8957 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008958 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008959 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008960}
8961
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962/* --- Helpers ------------------------------------------------------------ */
8963
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to at most `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` is not clamped to
   `len`, so callers still have to handle start > end.

   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the macro behaves as a single statement, e.g. under an unbraced
   `if`; it must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008980any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 Py_ssize_t start,
8982 Py_ssize_t end)
8983{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008984 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 void *buf1, *buf2;
8986 Py_ssize_t len1, len2, result;
8987
8988 kind1 = PyUnicode_KIND(s1);
8989 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008990 if (kind1 < kind2)
8991 return -1;
8992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 len1 = PyUnicode_GET_LENGTH(s1);
8994 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008995 ADJUST_INDICES(start, end, len1);
8996 if (end - start < len2)
8997 return -1;
8998
8999 buf1 = PyUnicode_DATA(s1);
9000 buf2 = PyUnicode_DATA(s2);
9001 if (len2 == 1) {
9002 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9003 result = findchar((const char *)buf1 + kind1*start,
9004 kind1, end - start, ch, direction);
9005 if (result == -1)
9006 return -1;
9007 else
9008 return start + result;
9009 }
9010
9011 if (kind2 != kind1) {
9012 buf2 = _PyUnicode_AsKind(s2, kind1);
9013 if (!buf2)
9014 return -2;
9015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016
Victor Stinner794d5672011-10-10 03:21:36 +02009017 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009018 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009019 case PyUnicode_1BYTE_KIND:
9020 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9021 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9022 else
9023 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9024 break;
9025 case PyUnicode_2BYTE_KIND:
9026 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9027 break;
9028 case PyUnicode_4BYTE_KIND:
9029 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9030 break;
9031 default:
9032 assert(0); result = -2;
9033 }
9034 }
9035 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009036 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009037 case PyUnicode_1BYTE_KIND:
9038 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9039 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9040 else
9041 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9042 break;
9043 case PyUnicode_2BYTE_KIND:
9044 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9045 break;
9046 case PyUnicode_4BYTE_KIND:
9047 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9048 break;
9049 default:
9050 assert(0); result = -2;
9051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 }
9053
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009054 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 PyMem_Free(buf2);
9056
9057 return result;
9058}
9059
9060Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009061_PyUnicode_InsertThousandsGrouping(
9062 PyObject *unicode, Py_ssize_t index,
9063 Py_ssize_t n_buffer,
9064 void *digits, Py_ssize_t n_digits,
9065 Py_ssize_t min_width,
9066 const char *grouping, PyObject *thousands_sep,
9067 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068{
Victor Stinner41a863c2012-02-24 00:37:51 +01009069 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009070 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009071 Py_ssize_t thousands_sep_len;
9072 Py_ssize_t len;
9073
9074 if (unicode != NULL) {
9075 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009076 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 }
9078 else {
9079 kind = PyUnicode_1BYTE_KIND;
9080 data = NULL;
9081 }
9082 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9083 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9084 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9085 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009086 if (thousands_sep_kind < kind) {
9087 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9088 if (!thousands_sep_data)
9089 return -1;
9090 }
9091 else {
9092 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9093 if (!data)
9094 return -1;
9095 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009096 }
9097
Benjamin Petersonead6b532011-12-20 17:23:42 -06009098 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009100 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009102 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009104 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009105 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009106 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009107 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009108 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009109 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009110 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009112 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009113 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009114 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009115 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009116 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009118 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009119 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009120 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009121 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009122 break;
9123 default:
9124 assert(0);
9125 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009127 if (unicode != NULL && thousands_sep_kind != kind) {
9128 if (thousands_sep_kind < kind)
9129 PyMem_Free(thousands_sep_data);
9130 else
9131 PyMem_Free(data);
9132 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009133 if (unicode == NULL) {
9134 *maxchar = 127;
9135 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009136 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009137 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009138 }
9139 }
9140 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141}
9142
9143
Alexander Belopolsky40018472011-02-26 01:02:56 +00009144Py_ssize_t
9145PyUnicode_Count(PyObject *str,
9146 PyObject *substr,
9147 Py_ssize_t start,
9148 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009150 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009151 PyObject* str_obj;
9152 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009153 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 void *buf1 = NULL, *buf2 = NULL;
9155 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009156
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009157 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009158 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009160 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009161 if (!sub_obj) {
9162 Py_DECREF(str_obj);
9163 return -1;
9164 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009165 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009166 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 Py_DECREF(str_obj);
9168 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 }
Tim Petersced69f82003-09-16 20:30:58 +00009170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 kind1 = PyUnicode_KIND(str_obj);
9172 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009173 if (kind1 < kind2) {
9174 Py_DECREF(sub_obj);
9175 Py_DECREF(str_obj);
9176 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009177 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 len1 = PyUnicode_GET_LENGTH(str_obj);
9180 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009182 if (end - start < len2) {
9183 Py_DECREF(sub_obj);
9184 Py_DECREF(str_obj);
9185 return 0;
9186 }
9187
9188 buf1 = PyUnicode_DATA(str_obj);
9189 buf2 = PyUnicode_DATA(sub_obj);
9190 if (kind2 != kind1) {
9191 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9192 if (!buf2)
9193 goto onError;
9194 }
9195
9196 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009198 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9199 result = asciilib_count(
9200 ((Py_UCS1*)buf1) + start, end - start,
9201 buf2, len2, PY_SSIZE_T_MAX
9202 );
9203 else
9204 result = ucs1lib_count(
9205 ((Py_UCS1*)buf1) + start, end - start,
9206 buf2, len2, PY_SSIZE_T_MAX
9207 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 break;
9209 case PyUnicode_2BYTE_KIND:
9210 result = ucs2lib_count(
9211 ((Py_UCS2*)buf1) + start, end - start,
9212 buf2, len2, PY_SSIZE_T_MAX
9213 );
9214 break;
9215 case PyUnicode_4BYTE_KIND:
9216 result = ucs4lib_count(
9217 ((Py_UCS4*)buf1) + start, end - start,
9218 buf2, len2, PY_SSIZE_T_MAX
9219 );
9220 break;
9221 default:
9222 assert(0); result = 0;
9223 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224
9225 Py_DECREF(sub_obj);
9226 Py_DECREF(str_obj);
9227
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009228 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 PyMem_Free(buf2);
9230
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 onError:
9233 Py_DECREF(sub_obj);
9234 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009235 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 PyMem_Free(buf2);
9237 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238}
9239
Alexander Belopolsky40018472011-02-26 01:02:56 +00009240Py_ssize_t
9241PyUnicode_Find(PyObject *str,
9242 PyObject *sub,
9243 Py_ssize_t start,
9244 Py_ssize_t end,
9245 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009247 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009248
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009250 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009251 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009252 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009253 if (!sub) {
9254 Py_DECREF(str);
9255 return -2;
9256 }
9257 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9258 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 Py_DECREF(str);
9260 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 }
Tim Petersced69f82003-09-16 20:30:58 +00009262
Victor Stinner794d5672011-10-10 03:21:36 +02009263 result = any_find_slice(direction,
9264 str, sub, start, end
9265 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009266
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009268 Py_DECREF(sub);
9269
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 return result;
9271}
9272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273Py_ssize_t
9274PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9275 Py_ssize_t start, Py_ssize_t end,
9276 int direction)
9277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009279 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 if (PyUnicode_READY(str) == -1)
9281 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009282 if (start < 0 || end < 0) {
9283 PyErr_SetString(PyExc_IndexError, "string index out of range");
9284 return -2;
9285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 if (end > PyUnicode_GET_LENGTH(str))
9287 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009288 if (start >= end)
9289 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009291 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9292 kind, end-start, ch, direction);
9293 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009295 else
9296 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297}
9298
Alexander Belopolsky40018472011-02-26 01:02:56 +00009299static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009300tailmatch(PyObject *self,
9301 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009302 Py_ssize_t start,
9303 Py_ssize_t end,
9304 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 int kind_self;
9307 int kind_sub;
9308 void *data_self;
9309 void *data_sub;
9310 Py_ssize_t offset;
9311 Py_ssize_t i;
9312 Py_ssize_t end_sub;
9313
9314 if (PyUnicode_READY(self) == -1 ||
9315 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009316 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317
9318 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 return 1;
9320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9322 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 kind_self = PyUnicode_KIND(self);
9327 data_self = PyUnicode_DATA(self);
9328 kind_sub = PyUnicode_KIND(substring);
9329 data_sub = PyUnicode_DATA(substring);
9330 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9331
9332 if (direction > 0)
9333 offset = end;
9334 else
9335 offset = start;
9336
9337 if (PyUnicode_READ(kind_self, data_self, offset) ==
9338 PyUnicode_READ(kind_sub, data_sub, 0) &&
9339 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9340 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9341 /* If both are of the same kind, memcmp is sufficient */
9342 if (kind_self == kind_sub) {
9343 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009344 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 data_sub,
9346 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009347 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 }
9349 /* otherwise we have to compare each character by first accesing it */
9350 else {
9351 /* We do not need to compare 0 and len(substring)-1 because
9352 the if statement above ensured already that they are equal
9353 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 for (i = 1; i < end_sub; ++i) {
9355 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9356 PyUnicode_READ(kind_sub, data_sub, i))
9357 return 0;
9358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
9362
9363 return 0;
9364}
9365
Alexander Belopolsky40018472011-02-26 01:02:56 +00009366Py_ssize_t
9367PyUnicode_Tailmatch(PyObject *str,
9368 PyObject *substr,
9369 Py_ssize_t start,
9370 Py_ssize_t end,
9371 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009373 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009374
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 str = PyUnicode_FromObject(str);
9376 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 substr = PyUnicode_FromObject(substr);
9379 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 Py_DECREF(str);
9381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Tim Petersced69f82003-09-16 20:30:58 +00009383
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009384 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 Py_DECREF(str);
9387 Py_DECREF(substr);
9388 return result;
9389}
9390
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391/* Apply fixfct filter to the Unicode object self and return a
9392 reference to the modified object */
9393
Alexander Belopolsky40018472011-02-26 01:02:56 +00009394static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009395fixup(PyObject *self,
9396 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 PyObject *u;
9399 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009400 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009402 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009405 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 /* fix functions return the new maximum character in a string,
9408 if the kind of the resulting unicode object does not change,
9409 everything is fine. Otherwise we need to change the string kind
9410 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009411 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009412
9413 if (maxchar_new == 0) {
9414 /* no changes */;
9415 if (PyUnicode_CheckExact(self)) {
9416 Py_DECREF(u);
9417 Py_INCREF(self);
9418 return self;
9419 }
9420 else
9421 return u;
9422 }
9423
Victor Stinnere6abb482012-05-02 01:15:40 +02009424 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425
Victor Stinnereaab6042011-12-11 22:22:39 +01009426 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009428
9429 /* In case the maximum character changed, we need to
9430 convert the string to the new category. */
9431 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9432 if (v == NULL) {
9433 Py_DECREF(u);
9434 return NULL;
9435 }
9436 if (maxchar_new > maxchar_old) {
9437 /* If the maxchar increased so that the kind changed, not all
9438 characters are representable anymore and we need to fix the
9439 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009440 _PyUnicode_FastCopyCharacters(v, 0,
9441 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009442 maxchar_old = fixfct(v);
9443 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 }
9445 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009446 _PyUnicode_FastCopyCharacters(v, 0,
9447 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009449 Py_DECREF(u);
9450 assert(_PyUnicode_CheckConsistency(v, 1));
9451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452}
9453
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009454static PyObject *
9455ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009457 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9458 char *resdata, *data = PyUnicode_DATA(self);
9459 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009460
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461 res = PyUnicode_New(len, 127);
9462 if (res == NULL)
9463 return NULL;
9464 resdata = PyUnicode_DATA(res);
9465 if (lower)
9466 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 _Py_bytes_upper(resdata, data, len);
9469 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009475 Py_ssize_t j;
9476 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009477 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009479
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9481
9482 where ! is a negation and \p{xxx} is a character with property xxx.
9483 */
9484 for (j = i - 1; j >= 0; j--) {
9485 c = PyUnicode_READ(kind, data, j);
9486 if (!_PyUnicode_IsCaseIgnorable(c))
9487 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009489 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9490 if (final_sigma) {
9491 for (j = i + 1; j < length; j++) {
9492 c = PyUnicode_READ(kind, data, j);
9493 if (!_PyUnicode_IsCaseIgnorable(c))
9494 break;
9495 }
9496 final_sigma = j == length || !_PyUnicode_IsCased(c);
9497 }
9498 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499}
9500
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501static int
9502lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9503 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505 /* Obscure special case. */
9506 if (c == 0x3A3) {
9507 mapped[0] = handle_capital_sigma(kind, data, length, i);
9508 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009510 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511}
9512
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513static Py_ssize_t
9514do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516 Py_ssize_t i, k = 0;
9517 int n_res, j;
9518 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009519
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520 c = PyUnicode_READ(kind, data, 0);
9521 n_res = _PyUnicode_ToUpperFull(c, mapped);
9522 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009523 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009526 for (i = 1; i < length; i++) {
9527 c = PyUnicode_READ(kind, data, i);
9528 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9529 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009530 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009531 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009532 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009533 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009534 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535}
9536
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009537static Py_ssize_t
9538do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9539 Py_ssize_t i, k = 0;
9540
9541 for (i = 0; i < length; i++) {
9542 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9543 int n_res, j;
9544 if (Py_UNICODE_ISUPPER(c)) {
9545 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9546 }
9547 else if (Py_UNICODE_ISLOWER(c)) {
9548 n_res = _PyUnicode_ToUpperFull(c, mapped);
9549 }
9550 else {
9551 n_res = 1;
9552 mapped[0] = c;
9553 }
9554 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009555 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009556 res[k++] = mapped[j];
9557 }
9558 }
9559 return k;
9560}
9561
9562static Py_ssize_t
9563do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9564 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009566 Py_ssize_t i, k = 0;
9567
9568 for (i = 0; i < length; i++) {
9569 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9570 int n_res, j;
9571 if (lower)
9572 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9573 else
9574 n_res = _PyUnicode_ToUpperFull(c, mapped);
9575 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009576 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009577 res[k++] = mapped[j];
9578 }
9579 }
9580 return k;
9581}
9582
9583static Py_ssize_t
9584do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9585{
9586 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9587}
9588
9589static Py_ssize_t
9590do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9591{
9592 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9593}
9594
Benjamin Petersone51757f2012-01-12 21:10:29 -05009595static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009596do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9597{
9598 Py_ssize_t i, k = 0;
9599
9600 for (i = 0; i < length; i++) {
9601 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9602 Py_UCS4 mapped[3];
9603 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9604 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009605 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009606 res[k++] = mapped[j];
9607 }
9608 }
9609 return k;
9610}
9611
9612static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009613do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9614{
9615 Py_ssize_t i, k = 0;
9616 int previous_is_cased;
9617
9618 previous_is_cased = 0;
9619 for (i = 0; i < length; i++) {
9620 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9621 Py_UCS4 mapped[3];
9622 int n_res, j;
9623
9624 if (previous_is_cased)
9625 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9626 else
9627 n_res = _PyUnicode_ToTitleFull(c, mapped);
9628
9629 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009630 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009631 res[k++] = mapped[j];
9632 }
9633
9634 previous_is_cased = _PyUnicode_IsCased(c);
9635 }
9636 return k;
9637}
9638
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639static PyObject *
9640case_operation(PyObject *self,
9641 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9642{
9643 PyObject *res = NULL;
9644 Py_ssize_t length, newlength = 0;
9645 int kind, outkind;
9646 void *data, *outdata;
9647 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9648
Benjamin Petersoneea48462012-01-16 14:28:50 -05009649 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650
9651 kind = PyUnicode_KIND(self);
9652 data = PyUnicode_DATA(self);
9653 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009654 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009655 PyErr_SetString(PyExc_OverflowError, "string is too long");
9656 return NULL;
9657 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009658 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 if (tmp == NULL)
9660 return PyErr_NoMemory();
9661 newlength = perform(kind, data, length, tmp, &maxchar);
9662 res = PyUnicode_New(newlength, maxchar);
9663 if (res == NULL)
9664 goto leave;
9665 tmpend = tmp + newlength;
9666 outdata = PyUnicode_DATA(res);
9667 outkind = PyUnicode_KIND(res);
9668 switch (outkind) {
9669 case PyUnicode_1BYTE_KIND:
9670 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9671 break;
9672 case PyUnicode_2BYTE_KIND:
9673 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9674 break;
9675 case PyUnicode_4BYTE_KIND:
9676 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9677 break;
9678 default:
9679 assert(0);
9680 break;
9681 }
9682 leave:
9683 PyMem_FREE(tmp);
9684 return res;
9685}
9686
Tim Peters8ce9f162004-08-27 01:49:32 +00009687PyObject *
9688PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009691 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009693 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009694 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9695 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009696 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009698 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009700 int use_memcpy;
9701 unsigned char *res_data = NULL, *sep_data = NULL;
9702 PyObject *last_obj;
9703 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009705 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009706 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009708 }
9709
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009710 /* NOTE: the following code can't call back into Python code,
9711 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009712 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009713
Tim Peters05eba1f2004-08-27 21:32:02 +00009714 seqlen = PySequence_Fast_GET_SIZE(fseq);
9715 /* If empty sequence, return u"". */
9716 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009717 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009718 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009719 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009720
Tim Peters05eba1f2004-08-27 21:32:02 +00009721 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009722 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009723 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009724 if (seqlen == 1) {
9725 if (PyUnicode_CheckExact(items[0])) {
9726 res = items[0];
9727 Py_INCREF(res);
9728 Py_DECREF(fseq);
9729 return res;
9730 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009731 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009732 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009733 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009734 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009735 /* Set up sep and seplen */
9736 if (separator == NULL) {
9737 /* fall back to a blank space separator */
9738 sep = PyUnicode_FromOrdinal(' ');
9739 if (!sep)
9740 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009741 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009742 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009743 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009744 else {
9745 if (!PyUnicode_Check(separator)) {
9746 PyErr_Format(PyExc_TypeError,
9747 "separator: expected str instance,"
9748 " %.80s found",
9749 Py_TYPE(separator)->tp_name);
9750 goto onError;
9751 }
9752 if (PyUnicode_READY(separator))
9753 goto onError;
9754 sep = separator;
9755 seplen = PyUnicode_GET_LENGTH(separator);
9756 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9757 /* inc refcount to keep this code path symmetric with the
9758 above case of a blank separator */
9759 Py_INCREF(sep);
9760 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009761 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009762 }
9763
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009764 /* There are at least two things to join, or else we have a subclass
9765 * of str in the sequence.
9766 * Do a pre-pass to figure out the total amount of space we'll
9767 * need (sz), and see whether all argument are strings.
9768 */
9769 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009770#ifdef Py_DEBUG
9771 use_memcpy = 0;
9772#else
9773 use_memcpy = 1;
9774#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009775 for (i = 0; i < seqlen; i++) {
9776 const Py_ssize_t old_sz = sz;
9777 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009778 if (!PyUnicode_Check(item)) {
9779 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009780 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 " %.80s found",
9782 i, Py_TYPE(item)->tp_name);
9783 goto onError;
9784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 if (PyUnicode_READY(item) == -1)
9786 goto onError;
9787 sz += PyUnicode_GET_LENGTH(item);
9788 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009789 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009790 if (i != 0)
9791 sz += seplen;
9792 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9793 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009794 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009795 goto onError;
9796 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009797 if (use_memcpy && last_obj != NULL) {
9798 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9799 use_memcpy = 0;
9800 }
9801 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009802 }
Tim Petersced69f82003-09-16 20:30:58 +00009803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009805 if (res == NULL)
9806 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009807
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009808 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009809#ifdef Py_DEBUG
9810 use_memcpy = 0;
9811#else
9812 if (use_memcpy) {
9813 res_data = PyUnicode_1BYTE_DATA(res);
9814 kind = PyUnicode_KIND(res);
9815 if (seplen != 0)
9816 sep_data = PyUnicode_1BYTE_DATA(sep);
9817 }
9818#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009819 if (use_memcpy) {
9820 for (i = 0; i < seqlen; ++i) {
9821 Py_ssize_t itemlen;
9822 item = items[i];
9823
9824 /* Copy item, and maybe the separator. */
9825 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009826 Py_MEMCPY(res_data,
9827 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009828 kind * seplen);
9829 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009830 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009831
9832 itemlen = PyUnicode_GET_LENGTH(item);
9833 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009834 Py_MEMCPY(res_data,
9835 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009836 kind * itemlen);
9837 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009838 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009839 }
9840 assert(res_data == PyUnicode_1BYTE_DATA(res)
9841 + kind * PyUnicode_GET_LENGTH(res));
9842 }
9843 else {
9844 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9845 Py_ssize_t itemlen;
9846 item = items[i];
9847
9848 /* Copy item, and maybe the separator. */
9849 if (i && seplen != 0) {
9850 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9851 res_offset += seplen;
9852 }
9853
9854 itemlen = PyUnicode_GET_LENGTH(item);
9855 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009856 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009857 res_offset += itemlen;
9858 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009859 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009861 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009862
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009865 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009869 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009871 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872 return NULL;
9873}
9874
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the character
 * buffer `data`, beginning at code-unit index `start`.
 *
 * `kind` selects the width of one code unit and must be one of
 * PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND;
 * PyUnicode_WCHAR_KIND and any other value trip an assertion.
 *
 * `start` and `length` are counted in code units, not bytes.  No bounds
 * checking is performed and `value` is silently narrowed to the code-unit
 * type.  Every argument is fully parenthesized in the expansion, but
 * `kind`, `value` and `length` may still be evaluated more than once, so
 * do not pass expressions with side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            /* The cast must apply to the whole argument expression. */ \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_ssize_t i_ = 0; \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_ssize_t i_ = 0; \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9898
Victor Stinnerd3f08822012-05-29 12:57:52 +02009899void
9900_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9901 Py_UCS4 fill_char)
9902{
9903 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9904 const void *data = PyUnicode_DATA(unicode);
9905 assert(PyUnicode_IS_READY(unicode));
9906 assert(unicode_modifiable(unicode));
9907 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9908 assert(start >= 0);
9909 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9910 FILL(kind, data, fill_char, start, length);
9911}
9912
Victor Stinner3fe55312012-01-04 00:33:50 +01009913Py_ssize_t
9914PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9915 Py_UCS4 fill_char)
9916{
9917 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009918
9919 if (!PyUnicode_Check(unicode)) {
9920 PyErr_BadInternalCall();
9921 return -1;
9922 }
9923 if (PyUnicode_READY(unicode) == -1)
9924 return -1;
9925 if (unicode_check_modifiable(unicode))
9926 return -1;
9927
Victor Stinnerd3f08822012-05-29 12:57:52 +02009928 if (start < 0) {
9929 PyErr_SetString(PyExc_IndexError, "string index out of range");
9930 return -1;
9931 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009932 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9933 PyErr_SetString(PyExc_ValueError,
9934 "fill character is bigger than "
9935 "the string maximum character");
9936 return -1;
9937 }
9938
9939 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9940 length = Py_MIN(maxlen, length);
9941 if (length <= 0)
9942 return 0;
9943
Victor Stinnerd3f08822012-05-29 12:57:52 +02009944 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009945 return length;
9946}
9947
Victor Stinner9310abb2011-10-05 00:59:23 +02009948static PyObject *
9949pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009950 Py_ssize_t left,
9951 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 PyObject *u;
9955 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009956 int kind;
9957 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958
9959 if (left < 0)
9960 left = 0;
9961 if (right < 0)
9962 right = 0;
9963
Victor Stinnerc4b49542011-12-11 22:44:26 +01009964 if (left == 0 && right == 0)
9965 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9968 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009969 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9970 return NULL;
9971 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009973 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009975 if (!u)
9976 return NULL;
9977
9978 kind = PyUnicode_KIND(u);
9979 data = PyUnicode_DATA(u);
9980 if (left)
9981 FILL(kind, data, fill, 0, left);
9982 if (right)
9983 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009984 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009985 assert(_PyUnicode_CheckConsistency(u, 1));
9986 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987}
9988
Alexander Belopolsky40018472011-02-26 01:02:56 +00009989PyObject *
9990PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
9994 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009995 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009997 if (PyUnicode_READY(string) == -1) {
9998 Py_DECREF(string);
9999 return NULL;
10000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
Benjamin Petersonead6b532011-12-20 17:23:42 -060010002 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 if (PyUnicode_IS_ASCII(string))
10005 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010006 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010007 PyUnicode_GET_LENGTH(string), keepends);
10008 else
10009 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010010 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010011 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 break;
10013 case PyUnicode_2BYTE_KIND:
10014 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010015 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 PyUnicode_GET_LENGTH(string), keepends);
10017 break;
10018 case PyUnicode_4BYTE_KIND:
10019 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010020 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 PyUnicode_GET_LENGTH(string), keepends);
10022 break;
10023 default:
10024 assert(0);
10025 list = 0;
10026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 Py_DECREF(string);
10028 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029}
10030
Alexander Belopolsky40018472011-02-26 01:02:56 +000010031static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010032split(PyObject *self,
10033 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010034 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010036 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 void *buf1, *buf2;
10038 Py_ssize_t len1, len2;
10039 PyObject* out;
10040
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010042 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 if (PyUnicode_READY(self) == -1)
10045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010048 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010050 if (PyUnicode_IS_ASCII(self))
10051 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010052 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 PyUnicode_GET_LENGTH(self), maxcount
10054 );
10055 else
10056 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010057 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 PyUnicode_GET_LENGTH(self), maxcount
10059 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 case PyUnicode_2BYTE_KIND:
10061 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010062 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 PyUnicode_GET_LENGTH(self), maxcount
10064 );
10065 case PyUnicode_4BYTE_KIND:
10066 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010067 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 PyUnicode_GET_LENGTH(self), maxcount
10069 );
10070 default:
10071 assert(0);
10072 return NULL;
10073 }
10074
10075 if (PyUnicode_READY(substring) == -1)
10076 return NULL;
10077
10078 kind1 = PyUnicode_KIND(self);
10079 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 len1 = PyUnicode_GET_LENGTH(self);
10081 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010082 if (kind1 < kind2 || len1 < len2) {
10083 out = PyList_New(1);
10084 if (out == NULL)
10085 return NULL;
10086 Py_INCREF(self);
10087 PyList_SET_ITEM(out, 0, self);
10088 return out;
10089 }
10090 buf1 = PyUnicode_DATA(self);
10091 buf2 = PyUnicode_DATA(substring);
10092 if (kind2 != kind1) {
10093 buf2 = _PyUnicode_AsKind(substring, kind1);
10094 if (!buf2)
10095 return NULL;
10096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010098 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010100 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10101 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010102 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010103 else
10104 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010105 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 break;
10107 case PyUnicode_2BYTE_KIND:
10108 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010109 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 break;
10111 case PyUnicode_4BYTE_KIND:
10112 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010113 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 break;
10115 default:
10116 out = NULL;
10117 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010118 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 PyMem_Free(buf2);
10120 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121}
10122
Alexander Belopolsky40018472011-02-26 01:02:56 +000010123static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010124rsplit(PyObject *self,
10125 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010126 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010127{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010128 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 void *buf1, *buf2;
10130 Py_ssize_t len1, len2;
10131 PyObject* out;
10132
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010133 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010134 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 if (PyUnicode_READY(self) == -1)
10137 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010140 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 if (PyUnicode_IS_ASCII(self))
10143 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010144 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010145 PyUnicode_GET_LENGTH(self), maxcount
10146 );
10147 else
10148 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010149 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 PyUnicode_GET_LENGTH(self), maxcount
10151 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 case PyUnicode_2BYTE_KIND:
10153 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010154 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 PyUnicode_GET_LENGTH(self), maxcount
10156 );
10157 case PyUnicode_4BYTE_KIND:
10158 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 PyUnicode_GET_LENGTH(self), maxcount
10161 );
10162 default:
10163 assert(0);
10164 return NULL;
10165 }
10166
10167 if (PyUnicode_READY(substring) == -1)
10168 return NULL;
10169
10170 kind1 = PyUnicode_KIND(self);
10171 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 len1 = PyUnicode_GET_LENGTH(self);
10173 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010174 if (kind1 < kind2 || len1 < len2) {
10175 out = PyList_New(1);
10176 if (out == NULL)
10177 return NULL;
10178 Py_INCREF(self);
10179 PyList_SET_ITEM(out, 0, self);
10180 return out;
10181 }
10182 buf1 = PyUnicode_DATA(self);
10183 buf2 = PyUnicode_DATA(substring);
10184 if (kind2 != kind1) {
10185 buf2 = _PyUnicode_AsKind(substring, kind1);
10186 if (!buf2)
10187 return NULL;
10188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010190 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10193 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 else
10196 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 break;
10199 case PyUnicode_2BYTE_KIND:
10200 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 break;
10203 case PyUnicode_4BYTE_KIND:
10204 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010205 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 break;
10207 default:
10208 out = NULL;
10209 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010210 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 PyMem_Free(buf2);
10212 return out;
10213}
10214
10215static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10217 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010219 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10222 return asciilib_find(buf1, len1, buf2, len2, offset);
10223 else
10224 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 case PyUnicode_2BYTE_KIND:
10226 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10227 case PyUnicode_4BYTE_KIND:
10228 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10229 }
10230 assert(0);
10231 return -1;
10232}
10233
10234static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10236 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010238 switch (kind) {
10239 case PyUnicode_1BYTE_KIND:
10240 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10241 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10242 else
10243 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10244 case PyUnicode_2BYTE_KIND:
10245 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10246 case PyUnicode_4BYTE_KIND:
10247 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10248 }
10249 assert(0);
10250 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010251}
10252
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010253static void
10254replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10255 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10256{
10257 int kind = PyUnicode_KIND(u);
10258 void *data = PyUnicode_DATA(u);
10259 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10260 if (kind == PyUnicode_1BYTE_KIND) {
10261 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10262 (Py_UCS1 *)data + len,
10263 u1, u2, maxcount);
10264 }
10265 else if (kind == PyUnicode_2BYTE_KIND) {
10266 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10267 (Py_UCS2 *)data + len,
10268 u1, u2, maxcount);
10269 }
10270 else {
10271 assert(kind == PyUnicode_4BYTE_KIND);
10272 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10273 (Py_UCS4 *)data + len,
10274 u1, u2, maxcount);
10275 }
10276}
10277
Alexander Belopolsky40018472011-02-26 01:02:56 +000010278static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279replace(PyObject *self, PyObject *str1,
10280 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 PyObject *u;
10283 char *sbuf = PyUnicode_DATA(self);
10284 char *buf1 = PyUnicode_DATA(str1);
10285 char *buf2 = PyUnicode_DATA(str2);
10286 int srelease = 0, release1 = 0, release2 = 0;
10287 int skind = PyUnicode_KIND(self);
10288 int kind1 = PyUnicode_KIND(str1);
10289 int kind2 = PyUnicode_KIND(str2);
10290 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10291 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10292 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010293 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010294 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
10296 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010299 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
Victor Stinner59de0ee2011-10-07 10:01:28 +020010301 if (str1 == str2)
10302 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303
Victor Stinner49a0a212011-10-12 23:46:10 +020010304 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010305 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10306 if (maxchar < maxchar_str1)
10307 /* substring too wide to be present */
10308 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010309 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10310 /* Replacing str1 with str2 may cause a maxchar reduction in the
10311 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010312 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010313 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010318 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010320 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010321 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010322 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010323
Victor Stinner69ed0f42013-04-09 21:48:24 +020010324 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010325 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010326 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010328 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010330 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010332
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010333 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10334 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010335 }
10336 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 int rkind = skind;
10338 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010339 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (kind1 < rkind) {
10342 /* widen substring */
10343 buf1 = _PyUnicode_AsKind(str1, rkind);
10344 if (!buf1) goto error;
10345 release1 = 1;
10346 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010348 if (i < 0)
10349 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (rkind > kind2) {
10351 /* widen replacement */
10352 buf2 = _PyUnicode_AsKind(str2, rkind);
10353 if (!buf2) goto error;
10354 release2 = 1;
10355 }
10356 else if (rkind < kind2) {
10357 /* widen self and buf1 */
10358 rkind = kind2;
10359 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010360 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 sbuf = _PyUnicode_AsKind(self, rkind);
10362 if (!sbuf) goto error;
10363 srelease = 1;
10364 buf1 = _PyUnicode_AsKind(str1, rkind);
10365 if (!buf1) goto error;
10366 release1 = 1;
10367 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010368 u = PyUnicode_New(slen, maxchar);
10369 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010371 assert(PyUnicode_KIND(u) == rkind);
10372 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010373
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010374 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010375 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010376 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010378 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010380
10381 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010383 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010385 if (i == -1)
10386 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010387 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010389 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010393 }
10394 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010396 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 int rkind = skind;
10398 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010401 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 buf1 = _PyUnicode_AsKind(str1, rkind);
10403 if (!buf1) goto error;
10404 release1 = 1;
10405 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010406 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010407 if (n == 0)
10408 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010410 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 buf2 = _PyUnicode_AsKind(str2, rkind);
10412 if (!buf2) goto error;
10413 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010416 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 rkind = kind2;
10418 sbuf = _PyUnicode_AsKind(self, rkind);
10419 if (!sbuf) goto error;
10420 srelease = 1;
10421 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010422 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 buf1 = _PyUnicode_AsKind(str1, rkind);
10424 if (!buf1) goto error;
10425 release1 = 1;
10426 }
10427 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10428 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010429 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 PyErr_SetString(PyExc_OverflowError,
10431 "replace string is too long");
10432 goto error;
10433 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010434 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010435 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010436 _Py_INCREF_UNICODE_EMPTY();
10437 if (!unicode_empty)
10438 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010439 u = unicode_empty;
10440 goto done;
10441 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010442 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 PyErr_SetString(PyExc_OverflowError,
10444 "replace string is too long");
10445 goto error;
10446 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010447 u = PyUnicode_New(new_size, maxchar);
10448 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010450 assert(PyUnicode_KIND(u) == rkind);
10451 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 ires = i = 0;
10453 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 while (n-- > 0) {
10455 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010456 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010458 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010459 if (j == -1)
10460 break;
10461 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 memcpy(res + rkind * ires,
10464 sbuf + rkind * i,
10465 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 }
10468 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010470 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010472 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010478 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 memcpy(res + rkind * ires,
10480 sbuf + rkind * i,
10481 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010482 }
10483 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 /* interleave */
10485 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010486 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010488 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010490 if (--n <= 0)
10491 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010492 memcpy(res + rkind * ires,
10493 sbuf + rkind * i,
10494 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 ires++;
10496 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010498 memcpy(res + rkind * ires,
10499 sbuf + rkind * i,
10500 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 }
10503
10504 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010505 unicode_adjust_maxchar(&u);
10506 if (u == NULL)
10507 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010509
10510 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (srelease)
10512 PyMem_FREE(sbuf);
10513 if (release1)
10514 PyMem_FREE(buf1);
10515 if (release2)
10516 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010517 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (srelease)
10523 PyMem_FREE(sbuf);
10524 if (release1)
10525 PyMem_FREE(buf1);
10526 if (release2)
10527 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010528 return unicode_result_unchanged(self);
10529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 error:
10531 if (srelease && sbuf)
10532 PyMem_FREE(sbuf);
10533 if (release1 && buf1)
10534 PyMem_FREE(buf1);
10535 if (release2 && buf2)
10536 PyMem_FREE(buf2);
10537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538}
10539
10540/* --- Unicode Object Methods --------------------------------------------- */
10541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010542PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544\n\
10545Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010546characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547
10548static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010549unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010551 if (PyUnicode_READY(self) == -1)
10552 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010553 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554}
10555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010556PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558\n\
10559Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010560have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
10562static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010563unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010565 if (PyUnicode_READY(self) == -1)
10566 return NULL;
10567 if (PyUnicode_GET_LENGTH(self) == 0)
10568 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010569 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570}
10571
Benjamin Petersond5890c82012-01-14 13:23:30 -050010572PyDoc_STRVAR(casefold__doc__,
10573 "S.casefold() -> str\n\
10574\n\
10575Return a version of S suitable for caseless comparisons.");
10576
10577static PyObject *
10578unicode_casefold(PyObject *self)
10579{
10580 if (PyUnicode_READY(self) == -1)
10581 return NULL;
10582 if (PyUnicode_IS_ASCII(self))
10583 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010584 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010585}
10586
10587
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010588/* Argument converter. Coerces to a single unicode character */
10589
10590static int
10591convert_uc(PyObject *obj, void *addr)
10592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010594 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010595
Benjamin Peterson14339b62009-01-31 16:36:08 +000010596 uniobj = PyUnicode_FromObject(obj);
10597 if (uniobj == NULL) {
10598 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010600 return 0;
10601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010603 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010604 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010605 Py_DECREF(uniobj);
10606 return 0;
10607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010609 Py_DECREF(uniobj);
10610 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010611}
10612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010613PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010616Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010617done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618
10619static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010620unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010622 Py_ssize_t marg, left;
10623 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 Py_UCS4 fillchar = ' ';
10625
Victor Stinnere9a29352011-10-01 02:14:59 +020010626 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
Benjamin Petersonbac79492012-01-14 13:34:47 -050010629 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 return NULL;
10631
Victor Stinnerc4b49542011-12-11 22:44:26 +010010632 if (PyUnicode_GET_LENGTH(self) >= width)
10633 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634
Victor Stinnerc4b49542011-12-11 22:44:26 +010010635 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 left = marg / 2 + (marg & width & 1);
10637
Victor Stinner9310abb2011-10-05 00:59:23 +020010638 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639}
10640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641/* This function assumes that str1 and str2 are readied by the caller. */
10642
Marc-André Lemburge5034372000-08-08 08:04:29 +000010643static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010644unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010645{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010646#define COMPARE(TYPE1, TYPE2) \
10647 do { \
10648 TYPE1* p1 = (TYPE1 *)data1; \
10649 TYPE2* p2 = (TYPE2 *)data2; \
10650 TYPE1* end = p1 + len; \
10651 Py_UCS4 c1, c2; \
10652 for (; p1 != end; p1++, p2++) { \
10653 c1 = *p1; \
10654 c2 = *p2; \
10655 if (c1 != c2) \
10656 return (c1 < c2) ? -1 : 1; \
10657 } \
10658 } \
10659 while (0)
10660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 int kind1, kind2;
10662 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010663 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 kind1 = PyUnicode_KIND(str1);
10666 kind2 = PyUnicode_KIND(str2);
10667 data1 = PyUnicode_DATA(str1);
10668 data2 = PyUnicode_DATA(str2);
10669 len1 = PyUnicode_GET_LENGTH(str1);
10670 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010671 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010672
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010673 switch(kind1) {
10674 case PyUnicode_1BYTE_KIND:
10675 {
10676 switch(kind2) {
10677 case PyUnicode_1BYTE_KIND:
10678 {
10679 int cmp = memcmp(data1, data2, len);
10680 /* normalize result of memcmp() into the range [-1; 1] */
10681 if (cmp < 0)
10682 return -1;
10683 if (cmp > 0)
10684 return 1;
10685 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010686 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010687 case PyUnicode_2BYTE_KIND:
10688 COMPARE(Py_UCS1, Py_UCS2);
10689 break;
10690 case PyUnicode_4BYTE_KIND:
10691 COMPARE(Py_UCS1, Py_UCS4);
10692 break;
10693 default:
10694 assert(0);
10695 }
10696 break;
10697 }
10698 case PyUnicode_2BYTE_KIND:
10699 {
10700 switch(kind2) {
10701 case PyUnicode_1BYTE_KIND:
10702 COMPARE(Py_UCS2, Py_UCS1);
10703 break;
10704 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010705 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010706 COMPARE(Py_UCS2, Py_UCS2);
10707 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010708 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010709 case PyUnicode_4BYTE_KIND:
10710 COMPARE(Py_UCS2, Py_UCS4);
10711 break;
10712 default:
10713 assert(0);
10714 }
10715 break;
10716 }
10717 case PyUnicode_4BYTE_KIND:
10718 {
10719 switch(kind2) {
10720 case PyUnicode_1BYTE_KIND:
10721 COMPARE(Py_UCS4, Py_UCS1);
10722 break;
10723 case PyUnicode_2BYTE_KIND:
10724 COMPARE(Py_UCS4, Py_UCS2);
10725 break;
10726 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010727 {
10728#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10729 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10730 /* normalize result of wmemcmp() into the range [-1; 1] */
10731 if (cmp < 0)
10732 return -1;
10733 if (cmp > 0)
10734 return 1;
10735#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010736 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010737#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010738 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010739 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010740 default:
10741 assert(0);
10742 }
10743 break;
10744 }
10745 default:
10746 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010747 }
10748
Victor Stinner770e19e2012-10-04 22:59:45 +020010749 if (len1 == len2)
10750 return 0;
10751 if (len1 < len2)
10752 return -1;
10753 else
10754 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010755
10756#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010757}
10758
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010759Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010760unicode_compare_eq(PyObject *str1, PyObject *str2)
10761{
10762 int kind;
10763 void *data1, *data2;
10764 Py_ssize_t len;
10765 int cmp;
10766
Victor Stinnere5567ad2012-10-23 02:48:49 +020010767 len = PyUnicode_GET_LENGTH(str1);
10768 if (PyUnicode_GET_LENGTH(str2) != len)
10769 return 0;
10770 kind = PyUnicode_KIND(str1);
10771 if (PyUnicode_KIND(str2) != kind)
10772 return 0;
10773 data1 = PyUnicode_DATA(str1);
10774 data2 = PyUnicode_DATA(str2);
10775
10776 cmp = memcmp(data1, data2, len * kind);
10777 return (cmp == 0);
10778}
10779
10780
Alexander Belopolsky40018472011-02-26 01:02:56 +000010781int
10782PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10785 if (PyUnicode_READY(left) == -1 ||
10786 PyUnicode_READY(right) == -1)
10787 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010788
10789 /* a string is equal to itself */
10790 if (left == right)
10791 return 0;
10792
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010793 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010795 PyErr_Format(PyExc_TypeError,
10796 "Can't compare %.100s and %.100s",
10797 left->ob_type->tp_name,
10798 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 return -1;
10800}
10801
Martin v. Löwis5b222132007-06-10 09:51:05 +000010802int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010803_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10804{
10805 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10806 if (right_str == NULL)
10807 return -1;
10808 return PyUnicode_Compare(left, right_str);
10809}
10810
10811int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010812PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 Py_ssize_t i;
10815 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 Py_UCS4 chr;
10817
Victor Stinner910337b2011-10-03 03:20:16 +020010818 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (PyUnicode_READY(uni) == -1)
10820 return -1;
10821 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010822 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010823 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010824 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010825 size_t len, len2 = strlen(str);
10826 int cmp;
10827
10828 len = Py_MIN(len1, len2);
10829 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010830 if (cmp != 0) {
10831 if (cmp < 0)
10832 return -1;
10833 else
10834 return 1;
10835 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010836 if (len1 > len2)
10837 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010838 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010839 return -1; /* str is longer */
10840 return 0;
10841 }
10842 else {
10843 void *data = PyUnicode_DATA(uni);
10844 /* Compare Unicode string and source character set string */
10845 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010846 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010847 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10848 /* This check keeps Python strings that end in '\0' from comparing equal
10849 to C strings identical up to that point. */
10850 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10851 return 1; /* uni is longer */
10852 if (str[i])
10853 return -1; /* str is longer */
10854 return 0;
10855 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010856}
10857
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010858
Benjamin Peterson29060642009-01-31 22:14:21 +000010859#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010860 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010861
Alexander Belopolsky40018472011-02-26 01:02:56 +000010862PyObject *
10863PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010864{
10865 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010866 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010867
Victor Stinnere5567ad2012-10-23 02:48:49 +020010868 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10869 Py_RETURN_NOTIMPLEMENTED;
10870
10871 if (PyUnicode_READY(left) == -1 ||
10872 PyUnicode_READY(right) == -1)
10873 return NULL;
10874
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010875 if (left == right) {
10876 switch (op) {
10877 case Py_EQ:
10878 case Py_LE:
10879 case Py_GE:
10880 /* a string is equal to itself */
10881 v = Py_True;
10882 break;
10883 case Py_NE:
10884 case Py_LT:
10885 case Py_GT:
10886 v = Py_False;
10887 break;
10888 default:
10889 PyErr_BadArgument();
10890 return NULL;
10891 }
10892 }
10893 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010894 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010895 result ^= (op == Py_NE);
10896 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010897 }
10898 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010899 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010900
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010901 /* Convert the return value to a Boolean */
10902 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010903 case Py_LE:
10904 v = TEST_COND(result <= 0);
10905 break;
10906 case Py_GE:
10907 v = TEST_COND(result >= 0);
10908 break;
10909 case Py_LT:
10910 v = TEST_COND(result == -1);
10911 break;
10912 case Py_GT:
10913 v = TEST_COND(result == 1);
10914 break;
10915 default:
10916 PyErr_BadArgument();
10917 return NULL;
10918 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010919 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010920 Py_INCREF(v);
10921 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010922}
10923
Alexander Belopolsky40018472011-02-26 01:02:56 +000010924int
10925PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010926{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010928 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 void *buf1, *buf2;
10930 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010931 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010932
10933 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934 sub = PyUnicode_FromObject(element);
10935 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 PyErr_Format(PyExc_TypeError,
10937 "'in <string>' requires string as left operand, not %s",
10938 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010939 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010940 }
10941
Thomas Wouters477c8d52006-05-27 19:21:47 +000010942 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010943 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944 Py_DECREF(sub);
10945 return -1;
10946 }
10947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 kind1 = PyUnicode_KIND(str);
10949 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010950 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010952 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010953 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 }
10955 len1 = PyUnicode_GET_LENGTH(str);
10956 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010957 if (len1 < len2) {
10958 Py_DECREF(sub);
10959 Py_DECREF(str);
10960 return 0;
10961 }
10962 buf1 = PyUnicode_DATA(str);
10963 buf2 = PyUnicode_DATA(sub);
10964 if (len2 == 1) {
10965 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10966 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10967 Py_DECREF(sub);
10968 Py_DECREF(str);
10969 return result;
10970 }
10971 if (kind2 != kind1) {
10972 buf2 = _PyUnicode_AsKind(sub, kind1);
10973 if (!buf2) {
10974 Py_DECREF(sub);
10975 Py_DECREF(str);
10976 return -1;
10977 }
10978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979
Victor Stinner77282cb2013-04-14 19:22:47 +020010980 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 case PyUnicode_1BYTE_KIND:
10982 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10983 break;
10984 case PyUnicode_2BYTE_KIND:
10985 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10986 break;
10987 case PyUnicode_4BYTE_KIND:
10988 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10989 break;
10990 default:
10991 result = -1;
10992 assert(0);
10993 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010994
10995 Py_DECREF(str);
10996 Py_DECREF(sub);
10997
Victor Stinner77282cb2013-04-14 19:22:47 +020010998 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 PyMem_Free(buf2);
11000
Guido van Rossum403d68b2000-03-13 15:55:09 +000011001 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011002}
11003
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004/* Concat to string or Unicode object giving a new Unicode object. */
11005
Alexander Belopolsky40018472011-02-26 01:02:56 +000011006PyObject *
11007PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011010 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011011 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012
11013 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
11021 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011022 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011026 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 }
11030
Victor Stinner488fa492011-12-12 00:01:39 +010011031 u_len = PyUnicode_GET_LENGTH(u);
11032 v_len = PyUnicode_GET_LENGTH(v);
11033 if (u_len > PY_SSIZE_T_MAX - v_len) {
11034 PyErr_SetString(PyExc_OverflowError,
11035 "strings are too large to concat");
11036 goto onError;
11037 }
11038 new_len = u_len + v_len;
11039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011041 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011042 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011045 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011048 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11049 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 Py_DECREF(u);
11051 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011052 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 Py_XDECREF(u);
11057 Py_XDECREF(v);
11058 return NULL;
11059}
11060
Walter Dörwald1ab83302007-05-18 17:15:44 +000011061void
Victor Stinner23e56682011-10-03 03:54:37 +020011062PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011063{
Victor Stinner23e56682011-10-03 03:54:37 +020011064 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011065 Py_UCS4 maxchar, maxchar2;
11066 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011067
11068 if (p_left == NULL) {
11069 if (!PyErr_Occurred())
11070 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011071 return;
11072 }
Victor Stinner23e56682011-10-03 03:54:37 +020011073 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011074 if (right == NULL || left == NULL
11075 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011076 if (!PyErr_Occurred())
11077 PyErr_BadInternalCall();
11078 goto error;
11079 }
11080
Benjamin Petersonbac79492012-01-14 13:34:47 -050011081 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011082 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011083 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011084 goto error;
11085
Victor Stinner488fa492011-12-12 00:01:39 +010011086 /* Shortcuts */
11087 if (left == unicode_empty) {
11088 Py_DECREF(left);
11089 Py_INCREF(right);
11090 *p_left = right;
11091 return;
11092 }
11093 if (right == unicode_empty)
11094 return;
11095
11096 left_len = PyUnicode_GET_LENGTH(left);
11097 right_len = PyUnicode_GET_LENGTH(right);
11098 if (left_len > PY_SSIZE_T_MAX - right_len) {
11099 PyErr_SetString(PyExc_OverflowError,
11100 "strings are too large to concat");
11101 goto error;
11102 }
11103 new_len = left_len + right_len;
11104
11105 if (unicode_modifiable(left)
11106 && PyUnicode_CheckExact(right)
11107 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011108 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11109 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011110 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011111 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011112 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11113 {
11114 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011115 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011116 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011117
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011118 /* copy 'right' into the newly allocated area of 'left' */
11119 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011120 }
Victor Stinner488fa492011-12-12 00:01:39 +010011121 else {
11122 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11123 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011124 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011125
Victor Stinner488fa492011-12-12 00:01:39 +010011126 /* Concat the two Unicode strings */
11127 res = PyUnicode_New(new_len, maxchar);
11128 if (res == NULL)
11129 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011130 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11131 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011132 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011133 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011134 }
11135 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011136 return;
11137
11138error:
Victor Stinner488fa492011-12-12 00:01:39 +010011139 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011140}
11141
11142void
11143PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011145 PyUnicode_Append(pleft, right);
11146 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011147}
11148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011153string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011154interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155
11156static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011157unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011159 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011160 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011161 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011163 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 void *buf1, *buf2;
11165 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166
Jesus Ceaac451502011-04-20 17:09:23 +020011167 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11168 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011169 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 kind1 = PyUnicode_KIND(self);
11172 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011173 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011174 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011175 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 len1 = PyUnicode_GET_LENGTH(self);
11178 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011180 if (end - start < len2) {
11181 Py_DECREF(substring);
11182 return PyLong_FromLong(0);
11183 }
11184 buf1 = PyUnicode_DATA(self);
11185 buf2 = PyUnicode_DATA(substring);
11186 if (kind2 != kind1) {
11187 buf2 = _PyUnicode_AsKind(substring, kind1);
11188 if (!buf2) {
11189 Py_DECREF(substring);
11190 return NULL;
11191 }
11192 }
11193 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 case PyUnicode_1BYTE_KIND:
11195 iresult = ucs1lib_count(
11196 ((Py_UCS1*)buf1) + start, end - start,
11197 buf2, len2, PY_SSIZE_T_MAX
11198 );
11199 break;
11200 case PyUnicode_2BYTE_KIND:
11201 iresult = ucs2lib_count(
11202 ((Py_UCS2*)buf1) + start, end - start,
11203 buf2, len2, PY_SSIZE_T_MAX
11204 );
11205 break;
11206 case PyUnicode_4BYTE_KIND:
11207 iresult = ucs4lib_count(
11208 ((Py_UCS4*)buf1) + start, end - start,
11209 buf2, len2, PY_SSIZE_T_MAX
11210 );
11211 break;
11212 default:
11213 assert(0); iresult = 0;
11214 }
11215
11216 result = PyLong_FromSsize_t(iresult);
11217
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011218 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
11221 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011222
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 return result;
11224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011227 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011229Encode S using the codec registered for encoding. Default encoding\n\
11230is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011231handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011232a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11233'xmlcharrefreplace' as well as any other name registered with\n\
11234codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
11236static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011237unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011239 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 char *encoding = NULL;
11241 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011242
Benjamin Peterson308d6372009-09-18 21:42:35 +000011243 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11244 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011246 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011247}
11248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011250 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251\n\
11252Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
11255static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011256unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011258 Py_ssize_t i, j, line_pos, src_len, incr;
11259 Py_UCS4 ch;
11260 PyObject *u;
11261 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011262 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011264 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011265 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266
Ezio Melotti745d54d2013-11-16 19:10:57 +020011267 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11268 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
Antoine Pitrou22425222011-10-04 19:10:51 +020011271 if (PyUnicode_READY(self) == -1)
11272 return NULL;
11273
Thomas Wouters7e474022000-07-16 12:04:32 +000011274 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 src_len = PyUnicode_GET_LENGTH(self);
11276 i = j = line_pos = 0;
11277 kind = PyUnicode_KIND(self);
11278 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011279 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 for (; i < src_len; i++) {
11281 ch = PyUnicode_READ(kind, src_data, i);
11282 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011283 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011287 goto overflow;
11288 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011290 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 goto overflow;
11295 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 if (ch == '\n' || ch == '\r')
11298 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011301 if (!found)
11302 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011303
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011305 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 if (!u)
11307 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
Antoine Pitroue71d5742011-10-04 15:55:09 +020011312 for (; i < src_len; i++) {
11313 ch = PyUnicode_READ(kind, src_data, i);
11314 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011316 incr = tabsize - (line_pos % tabsize);
11317 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011318 FILL(kind, dest_data, ' ', j, incr);
11319 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011321 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011323 line_pos++;
11324 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011325 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011326 if (ch == '\n' || ch == '\r')
11327 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011329 }
11330 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011331 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011332
Antoine Pitroue71d5742011-10-04 15:55:09 +020011333 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011334 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336}
11337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011338PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340\n\
11341Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011342such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343arguments start and end are interpreted as in slice notation.\n\
11344\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011345Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
11347static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011350 /* initialize variables to prevent gcc warning */
11351 PyObject *substring = NULL;
11352 Py_ssize_t start = 0;
11353 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011354 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
Jesus Ceaac451502011-04-20 17:09:23 +020011356 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11357 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
Christian Heimesd47802e2013-06-29 21:33:36 +020011360 if (PyUnicode_READY(self) == -1) {
11361 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011363 }
11364 if (PyUnicode_READY(substring) == -1) {
11365 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368
Victor Stinner7931d9a2011-11-04 00:22:48 +010011369 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (result == -2)
11374 return NULL;
11375
Christian Heimes217cfd12007-12-02 14:31:20 +000011376 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377}
11378
11379static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011380unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011382 void *data;
11383 enum PyUnicode_Kind kind;
11384 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011385
11386 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11387 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011389 }
11390 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11391 PyErr_SetString(PyExc_IndexError, "string index out of range");
11392 return NULL;
11393 }
11394 kind = PyUnicode_KIND(self);
11395 data = PyUnicode_DATA(self);
11396 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011397 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398}
11399
Guido van Rossumc2504932007-09-18 19:42:40 +000011400/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011401 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011402static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011403unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404{
Guido van Rossumc2504932007-09-18 19:42:40 +000011405 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011406 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011407
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011408#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011409 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011410#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 if (_PyUnicode_HASH(self) != -1)
11412 return _PyUnicode_HASH(self);
11413 if (PyUnicode_READY(self) == -1)
11414 return -1;
11415 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011416 /*
11417 We make the hash of the empty string be 0, rather than using
11418 (prefix ^ suffix), since this slightly obfuscates the hash secret
11419 */
11420 if (len == 0) {
11421 _PyUnicode_HASH(self) = 0;
11422 return 0;
11423 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011424 x = _Py_HashBytes(PyUnicode_DATA(self),
11425 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011427 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428}
11429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011438 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011439 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011440 PyObject *substring = NULL;
11441 Py_ssize_t start = 0;
11442 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Jesus Ceaac451502011-04-20 17:09:23 +020011444 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11445 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
Christian Heimesd47a0452013-06-29 21:21:37 +020011448 if (PyUnicode_READY(self) == -1) {
11449 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011451 }
11452 if (PyUnicode_READY(substring) == -1) {
11453 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456
Victor Stinner7931d9a2011-11-04 00:22:48 +010011457 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458
11459 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 if (result == -2)
11462 return NULL;
11463
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 if (result < 0) {
11465 PyErr_SetString(PyExc_ValueError, "substring not found");
11466 return NULL;
11467 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011468
Christian Heimes217cfd12007-12-02 14:31:20 +000011469 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470}
11471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011472PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011475Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011476at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011479unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 Py_ssize_t i, length;
11482 int kind;
11483 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 int cased;
11485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (PyUnicode_READY(self) == -1)
11487 return NULL;
11488 length = PyUnicode_GET_LENGTH(self);
11489 kind = PyUnicode_KIND(self);
11490 data = PyUnicode_DATA(self);
11491
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (length == 1)
11494 return PyBool_FromLong(
11495 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011497 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011500
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 for (i = 0; i < length; i++) {
11503 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011504
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11506 return PyBool_FromLong(0);
11507 else if (!cased && Py_UNICODE_ISLOWER(ch))
11508 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011510 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511}
11512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011513PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011516Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011517at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
11519static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011520unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 Py_ssize_t i, length;
11523 int kind;
11524 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 int cased;
11526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (PyUnicode_READY(self) == -1)
11528 return NULL;
11529 length = PyUnicode_GET_LENGTH(self);
11530 kind = PyUnicode_KIND(self);
11531 data = PyUnicode_DATA(self);
11532
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 if (length == 1)
11535 return PyBool_FromLong(
11536 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011538 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011541
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 for (i = 0; i < length; i++) {
11544 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011545
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11547 return PyBool_FromLong(0);
11548 else if (!cased && Py_UNICODE_ISUPPER(ch))
11549 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011551 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552}
11553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011554PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011557Return True if S is a titlecased string and there is at least one\n\
11558character in S, i.e. upper- and titlecase characters may only\n\
11559follow uncased characters and lowercase characters only cased ones.\n\
11560Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561
11562static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011563unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 Py_ssize_t i, length;
11566 int kind;
11567 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568 int cased, previous_is_cased;
11569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 if (PyUnicode_READY(self) == -1)
11571 return NULL;
11572 length = PyUnicode_GET_LENGTH(self);
11573 kind = PyUnicode_KIND(self);
11574 data = PyUnicode_DATA(self);
11575
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (length == 1) {
11578 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11579 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11580 (Py_UNICODE_ISUPPER(ch) != 0));
11581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011583 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011586
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587 cased = 0;
11588 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 for (i = 0; i < length; i++) {
11590 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011591
Benjamin Peterson29060642009-01-31 22:14:21 +000011592 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11593 if (previous_is_cased)
11594 return PyBool_FromLong(0);
11595 previous_is_cased = 1;
11596 cased = 1;
11597 }
11598 else if (Py_UNICODE_ISLOWER(ch)) {
11599 if (!previous_is_cased)
11600 return PyBool_FromLong(0);
11601 previous_is_cased = 1;
11602 cased = 1;
11603 }
11604 else
11605 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011607 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608}
11609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011610PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011613Return True if all characters in S are whitespace\n\
11614and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
11616static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011617unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 Py_ssize_t i, length;
11620 int kind;
11621 void *data;
11622
11623 if (PyUnicode_READY(self) == -1)
11624 return NULL;
11625 length = PyUnicode_GET_LENGTH(self);
11626 kind = PyUnicode_KIND(self);
11627 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (length == 1)
11631 return PyBool_FromLong(
11632 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011634 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 for (i = 0; i < length; i++) {
11639 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011640 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644}
11645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011646PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011648\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011649Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011651
11652static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011653unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 Py_ssize_t i, length;
11656 int kind;
11657 void *data;
11658
11659 if (PyUnicode_READY(self) == -1)
11660 return NULL;
11661 length = PyUnicode_GET_LENGTH(self);
11662 kind = PyUnicode_KIND(self);
11663 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011665 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (length == 1)
11667 return PyBool_FromLong(
11668 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669
11670 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 for (i = 0; i < length; i++) {
11675 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011678 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011679}
11680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011681PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011683\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011684Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011685and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011686
11687static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011688unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 int kind;
11691 void *data;
11692 Py_ssize_t len, i;
11693
11694 if (PyUnicode_READY(self) == -1)
11695 return NULL;
11696
11697 kind = PyUnicode_KIND(self);
11698 data = PyUnicode_DATA(self);
11699 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011700
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011701 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 if (len == 1) {
11703 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11704 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11705 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011706
11707 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 for (i = 0; i < len; i++) {
11712 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011713 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011716 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011717}
11718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011719PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011722Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
11725static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011726unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 Py_ssize_t i, length;
11729 int kind;
11730 void *data;
11731
11732 if (PyUnicode_READY(self) == -1)
11733 return NULL;
11734 length = PyUnicode_GET_LENGTH(self);
11735 kind = PyUnicode_KIND(self);
11736 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (length == 1)
11740 return PyBool_FromLong(
11741 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011743 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 for (i = 0; i < length; i++) {
11748 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011751 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752}
11753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011754PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011757Return True if all characters in S are digits\n\
11758and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
11760static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011761unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 Py_ssize_t i, length;
11764 int kind;
11765 void *data;
11766
11767 if (PyUnicode_READY(self) == -1)
11768 return NULL;
11769 length = PyUnicode_GET_LENGTH(self);
11770 kind = PyUnicode_KIND(self);
11771 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (length == 1) {
11775 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11776 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011779 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 for (i = 0; i < length; i++) {
11784 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011787 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788}
11789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011790PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011793Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011794False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795
11796static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011797unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 Py_ssize_t i, length;
11800 int kind;
11801 void *data;
11802
11803 if (PyUnicode_READY(self) == -1)
11804 return NULL;
11805 length = PyUnicode_GET_LENGTH(self);
11806 kind = PyUnicode_KIND(self);
11807 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (length == 1)
11811 return PyBool_FromLong(
11812 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011814 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 for (i = 0; i < length; i++) {
11819 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011822 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823}
11824
Martin v. Löwis47383402007-08-15 07:32:56 +000011825int
11826PyUnicode_IsIdentifier(PyObject *self)
11827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 int kind;
11829 void *data;
11830 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011831 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 if (PyUnicode_READY(self) == -1) {
11834 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 }
11837
11838 /* Special case for empty strings */
11839 if (PyUnicode_GET_LENGTH(self) == 0)
11840 return 0;
11841 kind = PyUnicode_KIND(self);
11842 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011843
11844 /* PEP 3131 says that the first character must be in
11845 XID_Start and subsequent characters in XID_Continue,
11846 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011847 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011848 letters, digits, underscore). However, given the current
11849 definition of XID_Start and XID_Continue, it is sufficient
11850 to check just for these, except that _ must be allowed
11851 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011853 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011854 return 0;
11855
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011856 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011859 return 1;
11860}
11861
11862PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011864\n\
11865Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011866to the language definition.\n\
11867\n\
11868Use keyword.iskeyword() to test for reserved identifiers\n\
11869such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011870
11871static PyObject*
11872unicode_isidentifier(PyObject *self)
11873{
11874 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11875}
11876
Georg Brandl559e5d72008-06-11 18:37:52 +000011877PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011879\n\
11880Return True if all characters in S are considered\n\
11881printable in repr() or S is empty, False otherwise.");
11882
11883static PyObject*
11884unicode_isprintable(PyObject *self)
11885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 Py_ssize_t i, length;
11887 int kind;
11888 void *data;
11889
11890 if (PyUnicode_READY(self) == -1)
11891 return NULL;
11892 length = PyUnicode_GET_LENGTH(self);
11893 kind = PyUnicode_KIND(self);
11894 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011895
11896 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (length == 1)
11898 return PyBool_FromLong(
11899 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 for (i = 0; i < length; i++) {
11902 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011903 Py_RETURN_FALSE;
11904 }
11905 }
11906 Py_RETURN_TRUE;
11907}
11908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011909PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011910 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911\n\
11912Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011913iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
11915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011916unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011918 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919}
11920
Martin v. Löwis18e16552006-02-15 17:27:45 +000011921static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011922unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 if (PyUnicode_READY(self) == -1)
11925 return -1;
11926 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927}
11928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011929PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011932Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011933done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011936unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011938 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 Py_UCS4 fillchar = ' ';
11940
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011941 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 return NULL;
11943
Benjamin Petersonbac79492012-01-14 13:34:47 -050011944 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
Victor Stinnerc4b49542011-12-11 22:44:26 +010011947 if (PyUnicode_GET_LENGTH(self) >= width)
11948 return unicode_result_unchanged(self);
11949
11950 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957
11958static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011959unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011961 if (PyUnicode_READY(self) == -1)
11962 return NULL;
11963 if (PyUnicode_IS_ASCII(self))
11964 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011965 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966}
11967
/* Selectors for the strip family; they double as indexes into
   stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string minus the leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[i]+3)
11976
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011977/* externally visible for str.strip(unicode) */
11978PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011979_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 void *data;
11982 int kind;
11983 Py_ssize_t i, j, len;
11984 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011985 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11988 return NULL;
11989
11990 kind = PyUnicode_KIND(self);
11991 data = PyUnicode_DATA(self);
11992 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011993 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11995 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011996 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011997
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 i = 0;
11999 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012000 while (i < len) {
12001 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12002 if (!BLOOM(sepmask, ch))
12003 break;
12004 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12005 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 i++;
12007 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012008 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012009
Benjamin Peterson14339b62009-01-31 16:36:08 +000012010 j = len;
12011 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012012 j--;
12013 while (j >= i) {
12014 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12015 if (!BLOOM(sepmask, ch))
12016 break;
12017 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12018 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012020 }
12021
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012023 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012024
Victor Stinner7931d9a2011-11-04 00:22:48 +010012025 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026}
12027
12028PyObject*
12029PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12030{
12031 unsigned char *data;
12032 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012033 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034
Victor Stinnerde636f32011-10-01 03:55:54 +020012035 if (PyUnicode_READY(self) == -1)
12036 return NULL;
12037
Victor Stinner684d5fd2012-05-03 02:32:34 +020012038 length = PyUnicode_GET_LENGTH(self);
12039 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012040
Victor Stinner684d5fd2012-05-03 02:32:34 +020012041 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012042 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043
Victor Stinnerde636f32011-10-01 03:55:54 +020012044 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012045 PyErr_SetString(PyExc_IndexError, "string index out of range");
12046 return NULL;
12047 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012048 if (start >= length || end < start)
12049 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012050
Victor Stinner684d5fd2012-05-03 02:32:34 +020012051 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012052 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012053 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012054 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012055 }
12056 else {
12057 kind = PyUnicode_KIND(self);
12058 data = PyUnicode_1BYTE_DATA(self);
12059 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012060 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012061 length);
12062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064
12065static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012066do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 Py_ssize_t len, i, j;
12069
12070 if (PyUnicode_READY(self) == -1)
12071 return NULL;
12072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012074
Victor Stinnercc7af722013-04-09 22:39:24 +020012075 if (PyUnicode_IS_ASCII(self)) {
12076 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12077
12078 i = 0;
12079 if (striptype != RIGHTSTRIP) {
12080 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012081 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012082 if (!_Py_ascii_whitespace[ch])
12083 break;
12084 i++;
12085 }
12086 }
12087
12088 j = len;
12089 if (striptype != LEFTSTRIP) {
12090 j--;
12091 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012092 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012093 if (!_Py_ascii_whitespace[ch])
12094 break;
12095 j--;
12096 }
12097 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012098 }
12099 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012100 else {
12101 int kind = PyUnicode_KIND(self);
12102 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103
Victor Stinnercc7af722013-04-09 22:39:24 +020012104 i = 0;
12105 if (striptype != RIGHTSTRIP) {
12106 while (i < len) {
12107 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12108 if (!Py_UNICODE_ISSPACE(ch))
12109 break;
12110 i++;
12111 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012112 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012113
12114 j = len;
12115 if (striptype != LEFTSTRIP) {
12116 j--;
12117 while (j >= i) {
12118 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12119 if (!Py_UNICODE_ISSPACE(ch))
12120 break;
12121 j--;
12122 }
12123 j++;
12124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126
Victor Stinner7931d9a2011-11-04 00:22:48 +010012127 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128}
12129
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130
12131static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012132do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012134 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135
Serhiy Storchakac6792272013-10-19 21:03:34 +030012136 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 if (sep != NULL && sep != Py_None) {
12140 if (PyUnicode_Check(sep))
12141 return _PyUnicode_XStrip(self, striptype, sep);
12142 else {
12143 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 "%s arg must be None or str",
12145 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 return NULL;
12147 }
12148 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151}
12152
12153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012154PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156\n\
12157Return a copy of the string S with leading and trailing\n\
12158whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012159If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160
12161static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012162unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012164 if (PyTuple_GET_SIZE(args) == 0)
12165 return do_strip(self, BOTHSTRIP); /* Common case */
12166 else
12167 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012168}
12169
12170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012171PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012173\n\
12174Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012175If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012176
12177static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012178unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012179{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012180 if (PyTuple_GET_SIZE(args) == 0)
12181 return do_strip(self, LEFTSTRIP); /* Common case */
12182 else
12183 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012184}
12185
12186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012187PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012189\n\
12190Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012191If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012192
12193static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012194unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012195{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012196 if (PyTuple_GET_SIZE(args) == 0)
12197 return do_strip(self, RIGHTSTRIP); /* Common case */
12198 else
12199 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012200}
12201
12202
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012204unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012206 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
Serhiy Storchaka05997252013-01-26 12:14:02 +020012209 if (len < 1)
12210 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
Victor Stinnerc4b49542011-12-11 22:44:26 +010012212 /* no repeat, return original string */
12213 if (len == 1)
12214 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012215
Benjamin Petersonbac79492012-01-14 13:34:47 -050012216 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 return NULL;
12218
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012219 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012220 PyErr_SetString(PyExc_OverflowError,
12221 "repeated string is too long");
12222 return NULL;
12223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012225
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012226 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 if (!u)
12228 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012229 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (PyUnicode_GET_LENGTH(str) == 1) {
12232 const int kind = PyUnicode_KIND(str);
12233 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012234 if (kind == PyUnicode_1BYTE_KIND) {
12235 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012236 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012237 }
12238 else if (kind == PyUnicode_2BYTE_KIND) {
12239 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012240 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012241 ucs2[n] = fill_char;
12242 } else {
12243 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12244 assert(kind == PyUnicode_4BYTE_KIND);
12245 for (n = 0; n < len; ++n)
12246 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 }
12249 else {
12250 /* number of characters copied this far */
12251 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012252 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 char *to = (char *) PyUnicode_DATA(u);
12254 Py_MEMCPY(to, PyUnicode_DATA(str),
12255 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 n = (done <= nchars-done) ? done : nchars-done;
12258 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 }
12262
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012263 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012264 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
Alexander Belopolsky40018472011-02-26 01:02:56 +000012267PyObject *
12268PyUnicode_Replace(PyObject *obj,
12269 PyObject *subobj,
12270 PyObject *replobj,
12271 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
12273 PyObject *self;
12274 PyObject *str1;
12275 PyObject *str2;
12276 PyObject *result;
12277
12278 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012279 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012282 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 Py_DECREF(self);
12284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 }
12286 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012287 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 Py_DECREF(self);
12289 Py_DECREF(str1);
12290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012292 if (PyUnicode_READY(self) == -1 ||
12293 PyUnicode_READY(str1) == -1 ||
12294 PyUnicode_READY(str2) == -1)
12295 result = NULL;
12296 else
12297 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298 Py_DECREF(self);
12299 Py_DECREF(str1);
12300 Py_DECREF(str2);
12301 return result;
12302}
12303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012304PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012305 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306\n\
12307Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012308old replaced by new. If the optional argument count is\n\
12309given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310
12311static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 PyObject *str1;
12315 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012316 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317 PyObject *result;
12318
Martin v. Löwis18e16552006-02-15 17:27:45 +000012319 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012321 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012324 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 return NULL;
12326 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012327 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 Py_DECREF(str1);
12329 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012330 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012331 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12332 result = NULL;
12333 else
12334 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
12336 Py_DECREF(str1);
12337 Py_DECREF(str2);
12338 return result;
12339}
12340
Alexander Belopolsky40018472011-02-26 01:02:56 +000012341static PyObject *
12342unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012344 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 Py_ssize_t isize;
12346 Py_ssize_t osize, squote, dquote, i, o;
12347 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012348 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012352 return NULL;
12353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 isize = PyUnicode_GET_LENGTH(unicode);
12355 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 /* Compute length of output, quote characters, and
12358 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012359 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 max = 127;
12361 squote = dquote = 0;
12362 ikind = PyUnicode_KIND(unicode);
12363 for (i = 0; i < isize; i++) {
12364 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012365 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012367 case '\'': squote++; break;
12368 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012370 incr = 2;
12371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 default:
12373 /* Fast-path ASCII */
12374 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012375 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012377 ;
12378 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012381 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012383 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012385 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012387 if (osize > PY_SSIZE_T_MAX - incr) {
12388 PyErr_SetString(PyExc_OverflowError,
12389 "string is too long to generate repr");
12390 return NULL;
12391 }
12392 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 }
12394
12395 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012396 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012398 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 if (dquote)
12400 /* Both squote and dquote present. Use squote,
12401 and escape them */
12402 osize += squote;
12403 else
12404 quote = '"';
12405 }
Victor Stinner55c08782013-04-14 18:45:39 +020012406 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407
12408 repr = PyUnicode_New(osize, max);
12409 if (repr == NULL)
12410 return NULL;
12411 okind = PyUnicode_KIND(repr);
12412 odata = PyUnicode_DATA(repr);
12413
12414 PyUnicode_WRITE(okind, odata, 0, quote);
12415 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012416 if (unchanged) {
12417 _PyUnicode_FastCopyCharacters(repr, 1,
12418 unicode, 0,
12419 isize);
12420 }
12421 else {
12422 for (i = 0, o = 1; i < isize; i++) {
12423 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424
Victor Stinner55c08782013-04-14 18:45:39 +020012425 /* Escape quotes and backslashes */
12426 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012427 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012429 continue;
12430 }
12431
12432 /* Map special whitespace to '\t', \n', '\r' */
12433 if (ch == '\t') {
12434 PyUnicode_WRITE(okind, odata, o++, '\\');
12435 PyUnicode_WRITE(okind, odata, o++, 't');
12436 }
12437 else if (ch == '\n') {
12438 PyUnicode_WRITE(okind, odata, o++, '\\');
12439 PyUnicode_WRITE(okind, odata, o++, 'n');
12440 }
12441 else if (ch == '\r') {
12442 PyUnicode_WRITE(okind, odata, o++, '\\');
12443 PyUnicode_WRITE(okind, odata, o++, 'r');
12444 }
12445
12446 /* Map non-printable US ASCII to '\xhh' */
12447 else if (ch < ' ' || ch == 0x7F) {
12448 PyUnicode_WRITE(okind, odata, o++, '\\');
12449 PyUnicode_WRITE(okind, odata, o++, 'x');
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12452 }
12453
12454 /* Copy ASCII characters as-is */
12455 else if (ch < 0x7F) {
12456 PyUnicode_WRITE(okind, odata, o++, ch);
12457 }
12458
12459 /* Non-ASCII characters */
12460 else {
12461 /* Map Unicode whitespace and control characters
12462 (categories Z* and C* except ASCII space)
12463 */
12464 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12465 PyUnicode_WRITE(okind, odata, o++, '\\');
12466 /* Map 8-bit characters to '\xhh' */
12467 if (ch <= 0xff) {
12468 PyUnicode_WRITE(okind, odata, o++, 'x');
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12471 }
12472 /* Map 16-bit characters to '\uxxxx' */
12473 else if (ch <= 0xffff) {
12474 PyUnicode_WRITE(okind, odata, o++, 'u');
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12476 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12477 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12478 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12479 }
12480 /* Map 21-bit characters to '\U00xxxxxx' */
12481 else {
12482 PyUnicode_WRITE(okind, odata, o++, 'U');
12483 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12484 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12485 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12486 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12487 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12488 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12489 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12490 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12491 }
12492 }
12493 /* Copy characters as-is */
12494 else {
12495 PyUnicode_WRITE(okind, odata, o++, ch);
12496 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012497 }
12498 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012501 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012502 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503}
12504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012505PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507\n\
12508Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012509such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510arguments start and end are interpreted as in slice notation.\n\
12511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
12514static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012517 /* initialize variables to prevent gcc warning */
12518 PyObject *substring = NULL;
12519 Py_ssize_t start = 0;
12520 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012521 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
Jesus Ceaac451502011-04-20 17:09:23 +020012523 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12524 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526
Christian Heimesea71a522013-06-29 21:17:34 +020012527 if (PyUnicode_READY(self) == -1) {
12528 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012530 }
12531 if (PyUnicode_READY(substring) == -1) {
12532 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535
Victor Stinner7931d9a2011-11-04 00:22:48 +010012536 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
12538 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 if (result == -2)
12541 return NULL;
12542
Christian Heimes217cfd12007-12-02 14:31:20 +000012543 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544}
12545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012546PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012549Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
12551static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012554 /* initialize variables to prevent gcc warning */
12555 PyObject *substring = NULL;
12556 Py_ssize_t start = 0;
12557 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012558 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
Jesus Ceaac451502011-04-20 17:09:23 +020012560 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12561 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
Christian Heimesea71a522013-06-29 21:17:34 +020012564 if (PyUnicode_READY(self) == -1) {
12565 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012567 }
12568 if (PyUnicode_READY(substring) == -1) {
12569 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572
Victor Stinner7931d9a2011-11-04 00:22:48 +010012573 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574
12575 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 if (result == -2)
12578 return NULL;
12579
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580 if (result < 0) {
12581 PyErr_SetString(PyExc_ValueError, "substring not found");
12582 return NULL;
12583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584
Christian Heimes217cfd12007-12-02 14:31:20 +000012585 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
12587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012588PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012591Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012592done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593
12594static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012595unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012597 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 Py_UCS4 fillchar = ' ';
12599
Victor Stinnere9a29352011-10-01 02:14:59 +020012600 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012602
Benjamin Petersonbac79492012-01-14 13:34:47 -050012603 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604 return NULL;
12605
Victor Stinnerc4b49542011-12-11 22:44:26 +010012606 if (PyUnicode_GET_LENGTH(self) >= width)
12607 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608
Victor Stinnerc4b49542011-12-11 22:44:26 +010012609 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610}
12611
Alexander Belopolsky40018472011-02-26 01:02:56 +000012612PyObject *
12613PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614{
12615 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012616
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617 s = PyUnicode_FromObject(s);
12618 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012619 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 if (sep != NULL) {
12621 sep = PyUnicode_FromObject(sep);
12622 if (sep == NULL) {
12623 Py_DECREF(s);
12624 return NULL;
12625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626 }
12627
Victor Stinner9310abb2011-10-05 00:59:23 +020012628 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629
12630 Py_DECREF(s);
12631 Py_XDECREF(sep);
12632 return result;
12633}
12634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012635PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012636 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637\n\
12638Return a list of the words in S, using sep as the\n\
12639delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012640splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012641whitespace string is a separator and empty strings are\n\
12642removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643
12644static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012645unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012647 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012649 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012651 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12652 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653 return NULL;
12654
12655 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012658 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012660 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661}
12662
Thomas Wouters477c8d52006-05-27 19:21:47 +000012663PyObject *
12664PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12665{
12666 PyObject* str_obj;
12667 PyObject* sep_obj;
12668 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012669 int kind1, kind2;
12670 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012672
12673 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012674 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012675 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012676 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012677 if (!sep_obj) {
12678 Py_DECREF(str_obj);
12679 return NULL;
12680 }
12681 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12682 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012683 Py_DECREF(str_obj);
12684 return NULL;
12685 }
12686
Victor Stinner14f8f022011-10-05 20:58:25 +020012687 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 len1 = PyUnicode_GET_LENGTH(str_obj);
12690 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012691 if (kind1 < kind2 || len1 < len2) {
12692 _Py_INCREF_UNICODE_EMPTY();
12693 if (!unicode_empty)
12694 out = NULL;
12695 else {
12696 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12697 Py_DECREF(unicode_empty);
12698 }
12699 Py_DECREF(sep_obj);
12700 Py_DECREF(str_obj);
12701 return out;
12702 }
12703 buf1 = PyUnicode_DATA(str_obj);
12704 buf2 = PyUnicode_DATA(sep_obj);
12705 if (kind2 != kind1) {
12706 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12707 if (!buf2)
12708 goto onError;
12709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012711 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012713 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12714 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12715 else
12716 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 break;
12718 case PyUnicode_2BYTE_KIND:
12719 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12720 break;
12721 case PyUnicode_4BYTE_KIND:
12722 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12723 break;
12724 default:
12725 assert(0);
12726 out = 0;
12727 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012728
12729 Py_DECREF(sep_obj);
12730 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012731 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012733
12734 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 onError:
12736 Py_DECREF(sep_obj);
12737 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012738 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 PyMem_Free(buf2);
12740 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012741}
12742
12743
12744PyObject *
12745PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12746{
12747 PyObject* str_obj;
12748 PyObject* sep_obj;
12749 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012750 int kind1, kind2;
12751 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012753
12754 str_obj = PyUnicode_FromObject(str_in);
12755 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757 sep_obj = PyUnicode_FromObject(sep_in);
12758 if (!sep_obj) {
12759 Py_DECREF(str_obj);
12760 return NULL;
12761 }
12762
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012763 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 len1 = PyUnicode_GET_LENGTH(str_obj);
12766 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012767 if (kind1 < kind2 || len1 < len2) {
12768 _Py_INCREF_UNICODE_EMPTY();
12769 if (!unicode_empty)
12770 out = NULL;
12771 else {
12772 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12773 Py_DECREF(unicode_empty);
12774 }
12775 Py_DECREF(sep_obj);
12776 Py_DECREF(str_obj);
12777 return out;
12778 }
12779 buf1 = PyUnicode_DATA(str_obj);
12780 buf2 = PyUnicode_DATA(sep_obj);
12781 if (kind2 != kind1) {
12782 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12783 if (!buf2)
12784 goto onError;
12785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012787 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012789 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12790 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12791 else
12792 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 break;
12794 case PyUnicode_2BYTE_KIND:
12795 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12796 break;
12797 case PyUnicode_4BYTE_KIND:
12798 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12799 break;
12800 default:
12801 assert(0);
12802 out = 0;
12803 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804
12805 Py_DECREF(sep_obj);
12806 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012807 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809
12810 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 onError:
12812 Py_DECREF(sep_obj);
12813 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012814 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 PyMem_Free(buf2);
12816 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817}
12818
12819PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012822Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012824found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825
12826static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012827unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828{
Victor Stinner9310abb2011-10-05 00:59:23 +020012829 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830}
12831
12832PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012833 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012834\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012835Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012836the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012837separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012838
12839static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012840unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841{
Victor Stinner9310abb2011-10-05 00:59:23 +020012842 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843}
12844
Alexander Belopolsky40018472011-02-26 01:02:56 +000012845PyObject *
12846PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012847{
12848 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012849
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012850 s = PyUnicode_FromObject(s);
12851 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012852 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 if (sep != NULL) {
12854 sep = PyUnicode_FromObject(sep);
12855 if (sep == NULL) {
12856 Py_DECREF(s);
12857 return NULL;
12858 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859 }
12860
Victor Stinner9310abb2011-10-05 00:59:23 +020012861 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012862
12863 Py_DECREF(s);
12864 Py_XDECREF(sep);
12865 return result;
12866}
12867
12868PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012869 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012870\n\
12871Return a list of the words in S, using sep as the\n\
12872delimiter string, starting at the end of the string and\n\
12873working to the front. If maxsplit is given, at most maxsplit\n\
12874splits are done. If sep is not specified, any whitespace string\n\
12875is a separator.");
12876
12877static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012878unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012879{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012880 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012881 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012882 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012883
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012884 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12885 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012886 return NULL;
12887
12888 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012889 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012890 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012891 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012892 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012893 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012894}
12895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012896PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898\n\
12899Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012900Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012901is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902
12903static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012904unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012906 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012907 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012909 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12910 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911 return NULL;
12912
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012913 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914}
12915
12916static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012917PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012919 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920}
12921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012922PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924\n\
12925Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012926and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927
12928static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012929unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012931 if (PyUnicode_READY(self) == -1)
12932 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012933 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934}
12935
Larry Hastings61272b72014-01-07 12:41:53 -080012936/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012937
Larry Hastings31826802013-10-19 00:09:25 -070012938@staticmethod
12939str.maketrans as unicode_maketrans
12940
12941 x: object
12942
12943 y: unicode=NULL
12944
12945 z: unicode=NULL
12946
12947 /
12948
12949Return a translation table usable for str.translate().
12950
12951If there is only one argument, it must be a dictionary mapping Unicode
12952ordinals (integers) or characters to Unicode ordinals, strings or None.
12953Character keys will be then converted to ordinals.
12954If there are two arguments, they must be strings of equal length, and
12955in the resulting dictionary, each character in x will be mapped to the
12956character at the same position in y. If there is a third argument, it
12957must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012958[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012959
Larry Hastings31826802013-10-19 00:09:25 -070012960static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012961unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012962/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012963{
Georg Brandlceee0772007-11-27 23:48:05 +000012964 PyObject *new = NULL, *key, *value;
12965 Py_ssize_t i = 0;
12966 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012967
Georg Brandlceee0772007-11-27 23:48:05 +000012968 new = PyDict_New();
12969 if (!new)
12970 return NULL;
12971 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 int x_kind, y_kind, z_kind;
12973 void *x_data, *y_data, *z_data;
12974
Georg Brandlceee0772007-11-27 23:48:05 +000012975 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012976 if (!PyUnicode_Check(x)) {
12977 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12978 "be a string if there is a second argument");
12979 goto err;
12980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012982 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12983 "arguments must have equal length");
12984 goto err;
12985 }
12986 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 x_kind = PyUnicode_KIND(x);
12988 y_kind = PyUnicode_KIND(y);
12989 x_data = PyUnicode_DATA(x);
12990 y_data = PyUnicode_DATA(y);
12991 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12992 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012993 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012994 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012995 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012996 if (!value) {
12997 Py_DECREF(key);
12998 goto err;
12999 }
Georg Brandlceee0772007-11-27 23:48:05 +000013000 res = PyDict_SetItem(new, key, value);
13001 Py_DECREF(key);
13002 Py_DECREF(value);
13003 if (res < 0)
13004 goto err;
13005 }
13006 /* create entries for deleting chars in z */
13007 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 z_kind = PyUnicode_KIND(z);
13009 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013010 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013012 if (!key)
13013 goto err;
13014 res = PyDict_SetItem(new, key, Py_None);
13015 Py_DECREF(key);
13016 if (res < 0)
13017 goto err;
13018 }
13019 }
13020 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 int kind;
13022 void *data;
13023
Georg Brandlceee0772007-11-27 23:48:05 +000013024 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013025 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013026 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13027 "to maketrans it must be a dict");
13028 goto err;
13029 }
13030 /* copy entries into the new dict, converting string keys to int keys */
13031 while (PyDict_Next(x, &i, &key, &value)) {
13032 if (PyUnicode_Check(key)) {
13033 /* convert string keys to integer keys */
13034 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013035 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013036 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13037 "table must be of length 1");
13038 goto err;
13039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 kind = PyUnicode_KIND(key);
13041 data = PyUnicode_DATA(key);
13042 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013043 if (!newkey)
13044 goto err;
13045 res = PyDict_SetItem(new, newkey, value);
13046 Py_DECREF(newkey);
13047 if (res < 0)
13048 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013049 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013050 /* just keep integer keys */
13051 if (PyDict_SetItem(new, key, value) < 0)
13052 goto err;
13053 } else {
13054 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13055 "be strings or integers");
13056 goto err;
13057 }
13058 }
13059 }
13060 return new;
13061 err:
13062 Py_DECREF(new);
13063 return NULL;
13064}
13065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013066PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068\n\
13069Return a copy of the string S, where all characters have been mapped\n\
13070through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013071Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013072Unmapped characters are left untouched. Characters mapped to None\n\
13073are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074
13075static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079}
13080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013081PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013084Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
13086static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013087unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013089 if (PyUnicode_READY(self) == -1)
13090 return NULL;
13091 if (PyUnicode_IS_ASCII(self))
13092 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013093 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094}
13095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013096PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013097 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013099Pad a numeric string S with zeros on the left, to fill a field\n\
13100of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101
13102static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013103unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013105 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013106 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013107 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 int kind;
13109 void *data;
13110 Py_UCS4 chr;
13111
Martin v. Löwis18e16552006-02-15 17:27:45 +000013112 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113 return NULL;
13114
Benjamin Petersonbac79492012-01-14 13:34:47 -050013115 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
Victor Stinnerc4b49542011-12-11 22:44:26 +010013118 if (PyUnicode_GET_LENGTH(self) >= width)
13119 return unicode_result_unchanged(self);
13120
13121 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122
13123 u = pad(self, fill, 0, '0');
13124
Walter Dörwald068325e2002-04-15 13:36:47 +000013125 if (u == NULL)
13126 return NULL;
13127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 kind = PyUnicode_KIND(u);
13129 data = PyUnicode_DATA(u);
13130 chr = PyUnicode_READ(kind, data, fill);
13131
13132 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 PyUnicode_WRITE(kind, data, 0, chr);
13135 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 }
13137
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013138 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013139 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
#if 0
/* Compiled out by the enclosing #if 0.  Thin wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013150PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013151 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013153Return True if S starts with the specified prefix, False otherwise.\n\
13154With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013155With optional end, stop comparing S at that position.\n\
13156prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157
13158static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013159unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013163 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013164 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013165 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013166 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167
Jesus Ceaac451502011-04-20 17:09:23 +020013168 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170 if (PyTuple_Check(subobj)) {
13171 Py_ssize_t i;
13172 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013173 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013174 if (substring == NULL)
13175 return NULL;
13176 result = tailmatch(self, substring, start, end, -1);
13177 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013178 if (result == -1)
13179 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013180 if (result) {
13181 Py_RETURN_TRUE;
13182 }
13183 }
13184 /* nothing matched */
13185 Py_RETURN_FALSE;
13186 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013187 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013188 if (substring == NULL) {
13189 if (PyErr_ExceptionMatches(PyExc_TypeError))
13190 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13191 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013193 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013194 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013196 if (result == -1)
13197 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199}
13200
13201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013202PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013205Return True if S ends with the specified suffix, False otherwise.\n\
13206With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207With optional end, stop comparing S at that position.\n\
13208suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209
13210static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013211unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013214 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013215 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013216 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013217 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013218 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219
Jesus Ceaac451502011-04-20 17:09:23 +020013220 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013221 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013222 if (PyTuple_Check(subobj)) {
13223 Py_ssize_t i;
13224 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013225 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013226 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013227 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013228 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229 result = tailmatch(self, substring, start, end, +1);
13230 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013231 if (result == -1)
13232 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013233 if (result) {
13234 Py_RETURN_TRUE;
13235 }
13236 }
13237 Py_RETURN_FALSE;
13238 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013239 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013240 if (substring == NULL) {
13241 if (PyErr_ExceptionMatches(PyExc_TypeError))
13242 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13243 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013244 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013245 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013246 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013247 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013248 if (result == -1)
13249 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013250 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251}
13252
Victor Stinner202fdca2012-05-07 12:47:02 +020013253Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013254_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013255{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013256 if (!writer->readonly)
13257 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13258 else {
13259 /* Copy-on-write mode: set buffer size to 0 so
13260 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13261 * next write. */
13262 writer->size = 0;
13263 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013264 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13265 writer->data = PyUnicode_DATA(writer->buffer);
13266 writer->kind = PyUnicode_KIND(writer->buffer);
13267}
13268
Victor Stinnerd3f08822012-05-29 12:57:52 +020013269void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013270_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013271{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272 memset(writer, 0, sizeof(*writer));
13273#ifdef Py_DEBUG
13274 writer->kind = 5; /* invalid kind */
13275#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013276 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013277}
13278
Victor Stinnerd3f08822012-05-29 12:57:52 +020013279int
13280_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13281 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013282{
Victor Stinner6989ba02013-11-18 21:08:39 +010013283#ifdef MS_WINDOWS
13284 /* On Windows, overallocate by 50% is the best factor */
13285# define OVERALLOCATE_FACTOR 2
13286#else
13287 /* On Linux, overallocate by 25% is the best factor */
13288# define OVERALLOCATE_FACTOR 4
13289#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013290 Py_ssize_t newlen;
13291 PyObject *newbuffer;
13292
Victor Stinnerd3f08822012-05-29 12:57:52 +020013293 assert(length > 0);
13294
Victor Stinner202fdca2012-05-07 12:47:02 +020013295 if (length > PY_SSIZE_T_MAX - writer->pos) {
13296 PyErr_NoMemory();
13297 return -1;
13298 }
13299 newlen = writer->pos + length;
13300
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013301 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013302
Victor Stinnerd3f08822012-05-29 12:57:52 +020013303 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013304 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013305 if (writer->overallocate
13306 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13307 /* overallocate to limit the number of realloc() */
13308 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013309 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013310 if (newlen < writer->min_length)
13311 newlen = writer->min_length;
13312
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313 writer->buffer = PyUnicode_New(newlen, maxchar);
13314 if (writer->buffer == NULL)
13315 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013317 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013318 if (writer->overallocate
13319 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13320 /* overallocate to limit the number of realloc() */
13321 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013322 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013323 if (newlen < writer->min_length)
13324 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013325
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013326 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013327 /* resize + widen */
13328 newbuffer = PyUnicode_New(newlen, maxchar);
13329 if (newbuffer == NULL)
13330 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013331 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13332 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013333 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013334 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013335 }
13336 else {
13337 newbuffer = resize_compact(writer->buffer, newlen);
13338 if (newbuffer == NULL)
13339 return -1;
13340 }
13341 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013342 }
13343 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013344 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013345 newbuffer = PyUnicode_New(writer->size, maxchar);
13346 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013347 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013348 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13349 writer->buffer, 0, writer->pos);
13350 Py_DECREF(writer->buffer);
13351 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013352 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013353 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013354 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013355
13356#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013357}
13358
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013359Py_LOCAL_INLINE(int)
13360_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013361{
13362 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13363 return -1;
13364 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13365 writer->pos++;
13366 return 0;
13367}
13368
13369int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013370_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13371{
13372 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13373}
13374
13375int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013376_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13377{
13378 Py_UCS4 maxchar;
13379 Py_ssize_t len;
13380
13381 if (PyUnicode_READY(str) == -1)
13382 return -1;
13383 len = PyUnicode_GET_LENGTH(str);
13384 if (len == 0)
13385 return 0;
13386 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13387 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013388 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013389 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013390 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013391 Py_INCREF(str);
13392 writer->buffer = str;
13393 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013394 writer->pos += len;
13395 return 0;
13396 }
13397 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13398 return -1;
13399 }
13400 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13401 str, 0, len);
13402 writer->pos += len;
13403 return 0;
13404}
13405
Victor Stinnere215d962012-10-06 23:03:36 +020013406int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013407_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13408 Py_ssize_t start, Py_ssize_t end)
13409{
13410 Py_UCS4 maxchar;
13411 Py_ssize_t len;
13412
13413 if (PyUnicode_READY(str) == -1)
13414 return -1;
13415
13416 assert(0 <= start);
13417 assert(end <= PyUnicode_GET_LENGTH(str));
13418 assert(start <= end);
13419
13420 if (end == 0)
13421 return 0;
13422
13423 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13424 return _PyUnicodeWriter_WriteStr(writer, str);
13425
13426 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13427 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13428 else
13429 maxchar = writer->maxchar;
13430 len = end - start;
13431
13432 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13433 return -1;
13434
13435 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13436 str, start, len);
13437 writer->pos += len;
13438 return 0;
13439}
13440
13441int
Victor Stinner4a587072013-11-19 12:54:53 +010013442_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13443 const char *ascii, Py_ssize_t len)
13444{
13445 if (len == -1)
13446 len = strlen(ascii);
13447
13448 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13449
13450 if (writer->buffer == NULL && !writer->overallocate) {
13451 PyObject *str;
13452
13453 str = _PyUnicode_FromASCII(ascii, len);
13454 if (str == NULL)
13455 return -1;
13456
13457 writer->readonly = 1;
13458 writer->buffer = str;
13459 _PyUnicodeWriter_Update(writer);
13460 writer->pos += len;
13461 return 0;
13462 }
13463
13464 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13465 return -1;
13466
13467 switch (writer->kind)
13468 {
13469 case PyUnicode_1BYTE_KIND:
13470 {
13471 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13472 Py_UCS1 *data = writer->data;
13473
13474 Py_MEMCPY(data + writer->pos, str, len);
13475 break;
13476 }
13477 case PyUnicode_2BYTE_KIND:
13478 {
13479 _PyUnicode_CONVERT_BYTES(
13480 Py_UCS1, Py_UCS2,
13481 ascii, ascii + len,
13482 (Py_UCS2 *)writer->data + writer->pos);
13483 break;
13484 }
13485 case PyUnicode_4BYTE_KIND:
13486 {
13487 _PyUnicode_CONVERT_BYTES(
13488 Py_UCS1, Py_UCS4,
13489 ascii, ascii + len,
13490 (Py_UCS4 *)writer->data + writer->pos);
13491 break;
13492 }
13493 default:
13494 assert(0);
13495 }
13496
13497 writer->pos += len;
13498 return 0;
13499}
13500
13501int
13502_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13503 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013504{
13505 Py_UCS4 maxchar;
13506
13507 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13508 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13509 return -1;
13510 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13511 writer->pos += len;
13512 return 0;
13513}
13514
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013516_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013517{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013518 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013520 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013521 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013523 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013524 str = writer->buffer;
13525 writer->buffer = NULL;
13526 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13527 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013528 }
13529 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13530 PyObject *newbuffer;
13531 newbuffer = resize_compact(writer->buffer, writer->pos);
13532 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013533 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534 return NULL;
13535 }
13536 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013537 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013538 str = writer->buffer;
13539 writer->buffer = NULL;
13540 assert(_PyUnicode_CheckConsistency(str, 1));
13541 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013542}
13543
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013545_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013546{
13547 Py_CLEAR(writer->buffer);
13548}
13549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013550#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013551
13552PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013554\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013555Return a formatted version of S, using substitutions from args and kwargs.\n\
13556The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013557
Eric Smith27bbca62010-11-04 17:06:58 +000013558PyDoc_STRVAR(format_map__doc__,
13559 "S.format_map(mapping) -> str\n\
13560\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013561Return a formatted version of S, using substitutions from mapping.\n\
13562The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013563
Eric Smith4a7d76d2008-05-30 18:10:19 +000013564static PyObject *
13565unicode__format__(PyObject* self, PyObject* args)
13566{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567 PyObject *format_spec;
13568 _PyUnicodeWriter writer;
13569 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013570
13571 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13572 return NULL;
13573
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574 if (PyUnicode_READY(self) == -1)
13575 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013576 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13578 self, format_spec, 0,
13579 PyUnicode_GET_LENGTH(format_spec));
13580 if (ret == -1) {
13581 _PyUnicodeWriter_Dealloc(&writer);
13582 return NULL;
13583 }
13584 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013585}
13586
Eric Smith8c663262007-08-25 02:26:07 +000013587PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013589\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013590Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013591
13592static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013593unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013594{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013595 Py_ssize_t size;
13596
13597 /* If it's a compact object, account for base structure +
13598 character data. */
13599 if (PyUnicode_IS_COMPACT_ASCII(v))
13600 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13601 else if (PyUnicode_IS_COMPACT(v))
13602 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013603 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 else {
13605 /* If it is a two-block object, account for base object, and
13606 for character block if present. */
13607 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013608 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013609 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013610 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 }
13612 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013613 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013614 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013616 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013617 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013618
13619 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013620}
13621
13622PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013624
13625static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013626unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013627{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013628 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 if (!copy)
13630 return NULL;
13631 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013632}
13633
Guido van Rossumd57fd912000-03-10 22:53:23 +000013634static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013635 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013636 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013637 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13638 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013639 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13640 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013641 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013642 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13643 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13644 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013645 {"expandtabs", (PyCFunction) unicode_expandtabs,
13646 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013647 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013648 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013649 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13650 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13651 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013652 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013653 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13654 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13655 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013656 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013657 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013658 {"splitlines", (PyCFunction) unicode_splitlines,
13659 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013660 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013661 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13662 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13663 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13664 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13665 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13666 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13667 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13668 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13669 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13670 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13671 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13672 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13673 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13674 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013675 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013676 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013677 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013678 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013679 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013680 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013681 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013682 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013683#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013684 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013685 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686#endif
13687
Benjamin Peterson14339b62009-01-31 16:36:08 +000013688 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 {NULL, NULL}
13690};
13691
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013692static PyObject *
13693unicode_mod(PyObject *v, PyObject *w)
13694{
Brian Curtindfc80e32011-08-10 20:28:54 -050013695 if (!PyUnicode_Check(v))
13696 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013698}
13699
13700static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013701 0, /*nb_add*/
13702 0, /*nb_subtract*/
13703 0, /*nb_multiply*/
13704 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013705};
13706
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013708 (lenfunc) unicode_length, /* sq_length */
13709 PyUnicode_Concat, /* sq_concat */
13710 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13711 (ssizeargfunc) unicode_getitem, /* sq_item */
13712 0, /* sq_slice */
13713 0, /* sq_ass_item */
13714 0, /* sq_ass_slice */
13715 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013716};
13717
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013718static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013719unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013720{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013721 if (PyUnicode_READY(self) == -1)
13722 return NULL;
13723
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013724 if (PyIndex_Check(item)) {
13725 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013726 if (i == -1 && PyErr_Occurred())
13727 return NULL;
13728 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013729 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013730 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013731 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013732 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013733 PyObject *result;
13734 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013735 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013736 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013738 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013740 return NULL;
13741 }
13742
13743 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013744 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013746 slicelength == PyUnicode_GET_LENGTH(self)) {
13747 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013748 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013749 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013750 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013751 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013752 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013753 src_kind = PyUnicode_KIND(self);
13754 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013755 if (!PyUnicode_IS_ASCII(self)) {
13756 kind_limit = kind_maxchar_limit(src_kind);
13757 max_char = 0;
13758 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13759 ch = PyUnicode_READ(src_kind, src_data, cur);
13760 if (ch > max_char) {
13761 max_char = ch;
13762 if (max_char >= kind_limit)
13763 break;
13764 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013765 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013766 }
Victor Stinner55c99112011-10-13 01:17:06 +020013767 else
13768 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013769 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013770 if (result == NULL)
13771 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013772 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013773 dest_data = PyUnicode_DATA(result);
13774
13775 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013776 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13777 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013778 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013779 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013780 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013781 } else {
13782 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13783 return NULL;
13784 }
13785}
13786
13787static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013788 (lenfunc)unicode_length, /* mp_length */
13789 (binaryfunc)unicode_subscript, /* mp_subscript */
13790 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013791};
13792
Guido van Rossumd57fd912000-03-10 22:53:23 +000013793
Guido van Rossumd57fd912000-03-10 22:53:23 +000013794/* Helpers for PyUnicode_Format() */
13795
Victor Stinnera47082312012-10-04 02:19:54 +020013796struct unicode_formatter_t {
13797 PyObject *args;
13798 int args_owned;
13799 Py_ssize_t arglen, argidx;
13800 PyObject *dict;
13801
13802 enum PyUnicode_Kind fmtkind;
13803 Py_ssize_t fmtcnt, fmtpos;
13804 void *fmtdata;
13805 PyObject *fmtstr;
13806
13807 _PyUnicodeWriter writer;
13808};
13809
13810struct unicode_format_arg_t {
13811 Py_UCS4 ch;
13812 int flags;
13813 Py_ssize_t width;
13814 int prec;
13815 int sign;
13816};
13817
Guido van Rossumd57fd912000-03-10 22:53:23 +000013818static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013819unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820{
Victor Stinnera47082312012-10-04 02:19:54 +020013821 Py_ssize_t argidx = ctx->argidx;
13822
13823 if (argidx < ctx->arglen) {
13824 ctx->argidx++;
13825 if (ctx->arglen < 0)
13826 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 else
Victor Stinnera47082312012-10-04 02:19:54 +020013828 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829 }
13830 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 return NULL;
13833}
13834
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013835/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836
Victor Stinnera47082312012-10-04 02:19:54 +020013837/* Format a float into the writer if the writer is not NULL, or into *p_output
13838 otherwise.
13839
13840 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841static int
Victor Stinnera47082312012-10-04 02:19:54 +020013842formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13843 PyObject **p_output,
13844 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013846 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013848 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013849 int prec;
13850 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013851
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852 x = PyFloat_AsDouble(v);
13853 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013855
Victor Stinnera47082312012-10-04 02:19:54 +020013856 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013857 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013858 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013859
Victor Stinnera47082312012-10-04 02:19:54 +020013860 if (arg->flags & F_ALT)
13861 dtoa_flags = Py_DTSF_ALT;
13862 else
13863 dtoa_flags = 0;
13864 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013865 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013866 return -1;
13867 len = strlen(p);
13868 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013869 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013870 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013871 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013872 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 }
13874 else
13875 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013876 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013877 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013878}
13879
Victor Stinnerd0880d52012-04-27 23:40:13 +020013880/* formatlong() emulates the format codes d, u, o, x and X, and
13881 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13882 * Python's regular ints.
13883 * Return value: a new PyUnicodeObject*, or NULL if error.
13884 * The output string is of the form
13885 * "-"? ("0x" | "0X")? digit+
13886 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13887 * set in flags. The case of hex digits will be correct,
13888 * There will be at least prec digits, zero-filled on the left if
13889 * necessary to get that many.
13890 * val object to be converted
13891 * flags bitmask of format flags; only F_ALT is looked at
13892 * prec minimum number of digits; 0-fill on left if needed
13893 * type a character in [duoxX]; u acts the same as d
13894 *
13895 * CAUTION: o, x and X conversions on regular ints can never
13896 * produce a '-' sign, but can for Python's unbounded ints.
13897 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013898PyObject *
13899_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013900{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013901 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013903 Py_ssize_t i;
13904 int sign; /* 1 if '-', else 0 */
13905 int len; /* number of characters */
13906 Py_ssize_t llen;
13907 int numdigits; /* len == numnondigits + numdigits */
13908 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013909
Victor Stinnerd0880d52012-04-27 23:40:13 +020013910 /* Avoid exceeding SSIZE_T_MAX */
13911 if (prec > INT_MAX-3) {
13912 PyErr_SetString(PyExc_OverflowError,
13913 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013914 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013915 }
13916
13917 assert(PyLong_Check(val));
13918
13919 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013920 default:
13921 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013922 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013923 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013924 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013925 /* int and int subclasses should print numerically when a numeric */
13926 /* format code is used (see issue18780) */
13927 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013928 break;
13929 case 'o':
13930 numnondigits = 2;
13931 result = PyNumber_ToBase(val, 8);
13932 break;
13933 case 'x':
13934 case 'X':
13935 numnondigits = 2;
13936 result = PyNumber_ToBase(val, 16);
13937 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013938 }
13939 if (!result)
13940 return NULL;
13941
13942 assert(unicode_modifiable(result));
13943 assert(PyUnicode_IS_READY(result));
13944 assert(PyUnicode_IS_ASCII(result));
13945
13946 /* To modify the string in-place, there can only be one reference. */
13947 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013948 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013949 PyErr_BadInternalCall();
13950 return NULL;
13951 }
13952 buf = PyUnicode_DATA(result);
13953 llen = PyUnicode_GET_LENGTH(result);
13954 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013955 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013956 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013957 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013958 return NULL;
13959 }
13960 len = (int)llen;
13961 sign = buf[0] == '-';
13962 numnondigits += sign;
13963 numdigits = len - numnondigits;
13964 assert(numdigits > 0);
13965
13966 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013967 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013968 (type == 'o' || type == 'x' || type == 'X'))) {
13969 assert(buf[sign] == '0');
13970 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13971 buf[sign+1] == 'o');
13972 numnondigits -= 2;
13973 buf += 2;
13974 len -= 2;
13975 if (sign)
13976 buf[0] = '-';
13977 assert(len == numnondigits + numdigits);
13978 assert(numdigits > 0);
13979 }
13980
13981 /* Fill with leading zeroes to meet minimum width. */
13982 if (prec > numdigits) {
13983 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13984 numnondigits + prec);
13985 char *b1;
13986 if (!r1) {
13987 Py_DECREF(result);
13988 return NULL;
13989 }
13990 b1 = PyBytes_AS_STRING(r1);
13991 for (i = 0; i < numnondigits; ++i)
13992 *b1++ = *buf++;
13993 for (i = 0; i < prec - numdigits; i++)
13994 *b1++ = '0';
13995 for (i = 0; i < numdigits; i++)
13996 *b1++ = *buf++;
13997 *b1 = '\0';
13998 Py_DECREF(result);
13999 result = r1;
14000 buf = PyBytes_AS_STRING(result);
14001 len = numnondigits + prec;
14002 }
14003
14004 /* Fix up case for hex conversions. */
14005 if (type == 'X') {
14006 /* Need to convert all lower case letters to upper case.
14007 and need to convert 0x to 0X (and -0x to -0X). */
14008 for (i = 0; i < len; i++)
14009 if (buf[i] >= 'a' && buf[i] <= 'x')
14010 buf[i] -= 'a'-'A';
14011 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014012 if (!PyUnicode_Check(result)
14013 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014014 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014015 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014016 Py_DECREF(result);
14017 result = unicode;
14018 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014019 else if (len != PyUnicode_GET_LENGTH(result)) {
14020 if (PyUnicode_Resize(&result, len) < 0)
14021 Py_CLEAR(result);
14022 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014024}
14025
Ethan Furmandf3ed242014-01-05 06:50:30 -080014026/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014027 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014028 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014029 * -1 and raise an exception on error */
14030static int
Victor Stinnera47082312012-10-04 02:19:54 +020014031mainformatlong(PyObject *v,
14032 struct unicode_format_arg_t *arg,
14033 PyObject **p_output,
14034 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014035{
14036 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014037 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014038
14039 if (!PyNumber_Check(v))
14040 goto wrongtype;
14041
Ethan Furman9ab74802014-03-21 06:38:46 -070014042 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014043 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014044 if (type == 'o' || type == 'x' || type == 'X') {
14045 iobj = PyNumber_Index(v);
14046 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014047 if (PyErr_ExceptionMatches(PyExc_TypeError))
14048 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014049 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014050 }
14051 }
14052 else {
14053 iobj = PyNumber_Long(v);
14054 if (iobj == NULL ) {
14055 if (PyErr_ExceptionMatches(PyExc_TypeError))
14056 goto wrongtype;
14057 return -1;
14058 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014059 }
14060 assert(PyLong_Check(iobj));
14061 }
14062 else {
14063 iobj = v;
14064 Py_INCREF(iobj);
14065 }
14066
14067 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014068 && arg->width == -1 && arg->prec == -1
14069 && !(arg->flags & (F_SIGN | F_BLANK))
14070 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014071 {
14072 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014073 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014074 int base;
14075
Victor Stinnera47082312012-10-04 02:19:54 +020014076 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014077 {
14078 default:
14079 assert(0 && "'type' not in [diuoxX]");
14080 case 'd':
14081 case 'i':
14082 case 'u':
14083 base = 10;
14084 break;
14085 case 'o':
14086 base = 8;
14087 break;
14088 case 'x':
14089 case 'X':
14090 base = 16;
14091 break;
14092 }
14093
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014094 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14095 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014097 }
14098 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014099 return 1;
14100 }
14101
Ethan Furmanb95b5612015-01-23 20:05:18 -080014102 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014103 Py_DECREF(iobj);
14104 if (res == NULL)
14105 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014106 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014107 return 0;
14108
14109wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014110 switch(type)
14111 {
14112 case 'o':
14113 case 'x':
14114 case 'X':
14115 PyErr_Format(PyExc_TypeError,
14116 "%%%c format: an integer is required, "
14117 "not %.200s",
14118 type, Py_TYPE(v)->tp_name);
14119 break;
14120 default:
14121 PyErr_Format(PyExc_TypeError,
14122 "%%%c format: a number is required, "
14123 "not %.200s",
14124 type, Py_TYPE(v)->tp_name);
14125 break;
14126 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014127 return -1;
14128}
14129
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014130static Py_UCS4
14131formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014132{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014133 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014134 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014135 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014136 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014138 goto onError;
14139 }
14140 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014141 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014143 /* make sure number is a type of integer */
14144 if (!PyLong_Check(v)) {
14145 iobj = PyNumber_Index(v);
14146 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014147 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014148 }
14149 v = iobj;
14150 Py_DECREF(iobj);
14151 }
14152 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014153 x = PyLong_AsLong(v);
14154 if (x == -1 && PyErr_Occurred())
14155 goto onError;
14156
Victor Stinner8faf8212011-12-08 22:14:11 +010014157 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014158 PyErr_SetString(PyExc_OverflowError,
14159 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014160 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014161 }
14162
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014163 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014165
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014167 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014168 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014169 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014170}
14171
Victor Stinnera47082312012-10-04 02:19:54 +020014172/* Parse options of an argument: flags, width, precision.
14173 Handle also "%(name)" syntax.
14174
14175 Return 0 if the argument has been formatted into arg->str.
14176 Return 1 if the argument has been written into ctx->writer,
14177 Raise an exception and return -1 on error. */
14178static int
14179unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14180 struct unicode_format_arg_t *arg)
14181{
14182#define FORMAT_READ(ctx) \
14183 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14184
14185 PyObject *v;
14186
Victor Stinnera47082312012-10-04 02:19:54 +020014187 if (arg->ch == '(') {
14188 /* Get argument value from a dictionary. Example: "%(name)s". */
14189 Py_ssize_t keystart;
14190 Py_ssize_t keylen;
14191 PyObject *key;
14192 int pcount = 1;
14193
14194 if (ctx->dict == NULL) {
14195 PyErr_SetString(PyExc_TypeError,
14196 "format requires a mapping");
14197 return -1;
14198 }
14199 ++ctx->fmtpos;
14200 --ctx->fmtcnt;
14201 keystart = ctx->fmtpos;
14202 /* Skip over balanced parentheses */
14203 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14204 arg->ch = FORMAT_READ(ctx);
14205 if (arg->ch == ')')
14206 --pcount;
14207 else if (arg->ch == '(')
14208 ++pcount;
14209 ctx->fmtpos++;
14210 }
14211 keylen = ctx->fmtpos - keystart - 1;
14212 if (ctx->fmtcnt < 0 || pcount > 0) {
14213 PyErr_SetString(PyExc_ValueError,
14214 "incomplete format key");
14215 return -1;
14216 }
14217 key = PyUnicode_Substring(ctx->fmtstr,
14218 keystart, keystart + keylen);
14219 if (key == NULL)
14220 return -1;
14221 if (ctx->args_owned) {
14222 Py_DECREF(ctx->args);
14223 ctx->args_owned = 0;
14224 }
14225 ctx->args = PyObject_GetItem(ctx->dict, key);
14226 Py_DECREF(key);
14227 if (ctx->args == NULL)
14228 return -1;
14229 ctx->args_owned = 1;
14230 ctx->arglen = -1;
14231 ctx->argidx = -2;
14232 }
14233
14234 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014235 while (--ctx->fmtcnt >= 0) {
14236 arg->ch = FORMAT_READ(ctx);
14237 ctx->fmtpos++;
14238 switch (arg->ch) {
14239 case '-': arg->flags |= F_LJUST; continue;
14240 case '+': arg->flags |= F_SIGN; continue;
14241 case ' ': arg->flags |= F_BLANK; continue;
14242 case '#': arg->flags |= F_ALT; continue;
14243 case '0': arg->flags |= F_ZERO; continue;
14244 }
14245 break;
14246 }
14247
14248 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014249 if (arg->ch == '*') {
14250 v = unicode_format_getnextarg(ctx);
14251 if (v == NULL)
14252 return -1;
14253 if (!PyLong_Check(v)) {
14254 PyErr_SetString(PyExc_TypeError,
14255 "* wants int");
14256 return -1;
14257 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014258 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014259 if (arg->width == -1 && PyErr_Occurred())
14260 return -1;
14261 if (arg->width < 0) {
14262 arg->flags |= F_LJUST;
14263 arg->width = -arg->width;
14264 }
14265 if (--ctx->fmtcnt >= 0) {
14266 arg->ch = FORMAT_READ(ctx);
14267 ctx->fmtpos++;
14268 }
14269 }
14270 else if (arg->ch >= '0' && arg->ch <= '9') {
14271 arg->width = arg->ch - '0';
14272 while (--ctx->fmtcnt >= 0) {
14273 arg->ch = FORMAT_READ(ctx);
14274 ctx->fmtpos++;
14275 if (arg->ch < '0' || arg->ch > '9')
14276 break;
14277 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14278 mixing signed and unsigned comparison. Since arg->ch is between
14279 '0' and '9', casting to int is safe. */
14280 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14281 PyErr_SetString(PyExc_ValueError,
14282 "width too big");
14283 return -1;
14284 }
14285 arg->width = arg->width*10 + (arg->ch - '0');
14286 }
14287 }
14288
14289 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014290 if (arg->ch == '.') {
14291 arg->prec = 0;
14292 if (--ctx->fmtcnt >= 0) {
14293 arg->ch = FORMAT_READ(ctx);
14294 ctx->fmtpos++;
14295 }
14296 if (arg->ch == '*') {
14297 v = unicode_format_getnextarg(ctx);
14298 if (v == NULL)
14299 return -1;
14300 if (!PyLong_Check(v)) {
14301 PyErr_SetString(PyExc_TypeError,
14302 "* wants int");
14303 return -1;
14304 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014305 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014306 if (arg->prec == -1 && PyErr_Occurred())
14307 return -1;
14308 if (arg->prec < 0)
14309 arg->prec = 0;
14310 if (--ctx->fmtcnt >= 0) {
14311 arg->ch = FORMAT_READ(ctx);
14312 ctx->fmtpos++;
14313 }
14314 }
14315 else if (arg->ch >= '0' && arg->ch <= '9') {
14316 arg->prec = arg->ch - '0';
14317 while (--ctx->fmtcnt >= 0) {
14318 arg->ch = FORMAT_READ(ctx);
14319 ctx->fmtpos++;
14320 if (arg->ch < '0' || arg->ch > '9')
14321 break;
14322 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14323 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014324 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014325 return -1;
14326 }
14327 arg->prec = arg->prec*10 + (arg->ch - '0');
14328 }
14329 }
14330 }
14331
14332 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14333 if (ctx->fmtcnt >= 0) {
14334 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14335 if (--ctx->fmtcnt >= 0) {
14336 arg->ch = FORMAT_READ(ctx);
14337 ctx->fmtpos++;
14338 }
14339 }
14340 }
14341 if (ctx->fmtcnt < 0) {
14342 PyErr_SetString(PyExc_ValueError,
14343 "incomplete format");
14344 return -1;
14345 }
14346 return 0;
14347
14348#undef FORMAT_READ
14349}
14350
14351/* Format one argument. Supported conversion specifiers:
14352
14353 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014354 - "i", "d", "u": int or float
14355 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014356 - "e", "E", "f", "F", "g", "G": float
14357 - "c": int or str (1 character)
14358
Victor Stinner8dbd4212012-12-04 09:30:24 +010014359 When possible, the output is written directly into the Unicode writer
14360 (ctx->writer). A string is created when padding is required.
14361
Victor Stinnera47082312012-10-04 02:19:54 +020014362 Return 0 if the argument has been formatted into *p_str,
14363 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014364 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014365static int
14366unicode_format_arg_format(struct unicode_formatter_t *ctx,
14367 struct unicode_format_arg_t *arg,
14368 PyObject **p_str)
14369{
14370 PyObject *v;
14371 _PyUnicodeWriter *writer = &ctx->writer;
14372
14373 if (ctx->fmtcnt == 0)
14374 ctx->writer.overallocate = 0;
14375
14376 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014377 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014378 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014379 return 1;
14380 }
14381
14382 v = unicode_format_getnextarg(ctx);
14383 if (v == NULL)
14384 return -1;
14385
Victor Stinnera47082312012-10-04 02:19:54 +020014386
14387 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014388 case 's':
14389 case 'r':
14390 case 'a':
14391 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14392 /* Fast path */
14393 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14394 return -1;
14395 return 1;
14396 }
14397
14398 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14399 *p_str = v;
14400 Py_INCREF(*p_str);
14401 }
14402 else {
14403 if (arg->ch == 's')
14404 *p_str = PyObject_Str(v);
14405 else if (arg->ch == 'r')
14406 *p_str = PyObject_Repr(v);
14407 else
14408 *p_str = PyObject_ASCII(v);
14409 }
14410 break;
14411
14412 case 'i':
14413 case 'd':
14414 case 'u':
14415 case 'o':
14416 case 'x':
14417 case 'X':
14418 {
14419 int ret = mainformatlong(v, arg, p_str, writer);
14420 if (ret != 0)
14421 return ret;
14422 arg->sign = 1;
14423 break;
14424 }
14425
14426 case 'e':
14427 case 'E':
14428 case 'f':
14429 case 'F':
14430 case 'g':
14431 case 'G':
14432 if (arg->width == -1 && arg->prec == -1
14433 && !(arg->flags & (F_SIGN | F_BLANK)))
14434 {
14435 /* Fast path */
14436 if (formatfloat(v, arg, NULL, writer) == -1)
14437 return -1;
14438 return 1;
14439 }
14440
14441 arg->sign = 1;
14442 if (formatfloat(v, arg, p_str, NULL) == -1)
14443 return -1;
14444 break;
14445
14446 case 'c':
14447 {
14448 Py_UCS4 ch = formatchar(v);
14449 if (ch == (Py_UCS4) -1)
14450 return -1;
14451 if (arg->width == -1 && arg->prec == -1) {
14452 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014453 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014454 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014455 return 1;
14456 }
14457 *p_str = PyUnicode_FromOrdinal(ch);
14458 break;
14459 }
14460
14461 default:
14462 PyErr_Format(PyExc_ValueError,
14463 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014464 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014465 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14466 (int)arg->ch,
14467 ctx->fmtpos - 1);
14468 return -1;
14469 }
14470 if (*p_str == NULL)
14471 return -1;
14472 assert (PyUnicode_Check(*p_str));
14473 return 0;
14474}
14475
/* Write the formatted string str for one conversion into ctx->writer,
   applying the truncation, sign, numeric prefix and padding requested
   by arg.

   arg->width and arg->sign are used as working state and are modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* The "0" flag is only honoured when arg->sign is set; everything
       else is padded with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no forced sign,
           so str is copied unchanged. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* str already starts with a sign: split it off so that it
               can be placed independently of the padding. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* Nothing to emit as a sign. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only counts towards maxchar when left padding
       will be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Reserve width characters, plus one for the sign when the
       remaining characters alone already fill the width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign is written before the padding;
           with space fill it is written after the padding, below. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are excluded from width and len in
           both cases; with space fill they are written after the
           padding, below. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Left padding done: no right padding remains. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, whatever fill is) */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14630
14631/* Helper of PyUnicode_Format(): format one arg.
14632 Return 0 on success, raise an exception and return -1 on error. */
14633static int
14634unicode_format_arg(struct unicode_formatter_t *ctx)
14635{
14636 struct unicode_format_arg_t arg;
14637 PyObject *str;
14638 int ret;
14639
Victor Stinner8dbd4212012-12-04 09:30:24 +010014640 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14641 arg.flags = 0;
14642 arg.width = -1;
14643 arg.prec = -1;
14644 arg.sign = 0;
14645 str = NULL;
14646
Victor Stinnera47082312012-10-04 02:19:54 +020014647 ret = unicode_format_arg_parse(ctx, &arg);
14648 if (ret == -1)
14649 return -1;
14650
14651 ret = unicode_format_arg_format(ctx, &arg, &str);
14652 if (ret == -1)
14653 return -1;
14654
14655 if (ret != 1) {
14656 ret = unicode_format_arg_output(ctx, &arg, str);
14657 Py_DECREF(str);
14658 if (ret == -1)
14659 return -1;
14660 }
14661
14662 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14663 PyErr_SetString(PyExc_TypeError,
14664 "not all arguments converted during string formatting");
14665 return -1;
14666 }
14667 return 0;
14668}
14669
Alexander Belopolsky40018472011-02-26 01:02:56 +000014670PyObject *
14671PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014672{
Victor Stinnera47082312012-10-04 02:19:54 +020014673 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014674
Guido van Rossumd57fd912000-03-10 22:53:23 +000014675 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014676 PyErr_BadInternalCall();
14677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014678 }
Victor Stinnera47082312012-10-04 02:19:54 +020014679
14680 ctx.fmtstr = PyUnicode_FromObject(format);
14681 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014682 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014683 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14684 Py_DECREF(ctx.fmtstr);
14685 return NULL;
14686 }
14687 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14688 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14689 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14690 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014691
Victor Stinner8f674cc2013-04-17 23:02:17 +020014692 _PyUnicodeWriter_Init(&ctx.writer);
14693 ctx.writer.min_length = ctx.fmtcnt + 100;
14694 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014695
Guido van Rossumd57fd912000-03-10 22:53:23 +000014696 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014697 ctx.arglen = PyTuple_Size(args);
14698 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014699 }
14700 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014701 ctx.arglen = -1;
14702 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014703 }
Victor Stinnera47082312012-10-04 02:19:54 +020014704 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014705 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014706 ctx.dict = args;
14707 else
14708 ctx.dict = NULL;
14709 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014710
Victor Stinnera47082312012-10-04 02:19:54 +020014711 while (--ctx.fmtcnt >= 0) {
14712 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014713 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014714
14715 nonfmtpos = ctx.fmtpos++;
14716 while (ctx.fmtcnt >= 0 &&
14717 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14718 ctx.fmtpos++;
14719 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014720 }
Victor Stinnera47082312012-10-04 02:19:54 +020014721 if (ctx.fmtcnt < 0) {
14722 ctx.fmtpos--;
14723 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014724 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014725
Victor Stinnercfc4c132013-04-03 01:48:39 +020014726 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14727 nonfmtpos, ctx.fmtpos) < 0)
14728 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014729 }
14730 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014731 ctx.fmtpos++;
14732 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014734 }
14735 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014736
Victor Stinnera47082312012-10-04 02:19:54 +020014737 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 PyErr_SetString(PyExc_TypeError,
14739 "not all arguments converted during string formatting");
14740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014741 }
14742
Victor Stinnera47082312012-10-04 02:19:54 +020014743 if (ctx.args_owned) {
14744 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014745 }
Victor Stinnera47082312012-10-04 02:19:54 +020014746 Py_DECREF(ctx.fmtstr);
14747 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014748
Benjamin Peterson29060642009-01-31 22:14:21 +000014749 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014750 Py_DECREF(ctx.fmtstr);
14751 _PyUnicodeWriter_Dealloc(&ctx.writer);
14752 if (ctx.args_owned) {
14753 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014754 }
14755 return NULL;
14756}
14757
Jeremy Hylton938ace62002-07-17 16:30:39 +000014758static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014759unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14760
Tim Peters6d6c1a32001-08-02 04:15:00 +000014761static PyObject *
14762unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14763{
Benjamin Peterson29060642009-01-31 22:14:21 +000014764 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014765 static char *kwlist[] = {"object", "encoding", "errors", 0};
14766 char *encoding = NULL;
14767 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014768
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 if (type != &PyUnicode_Type)
14770 return unicode_subtype_new(type, args, kwds);
14771 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014772 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014773 return NULL;
14774 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014775 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014776 if (encoding == NULL && errors == NULL)
14777 return PyObject_Str(x);
14778 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014779 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014780}
14781
Guido van Rossume023fe02001-08-30 03:12:59 +000014782static PyObject *
14783unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14784{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014785 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014786 Py_ssize_t length, char_size;
14787 int share_wstr, share_utf8;
14788 unsigned int kind;
14789 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014790
Benjamin Peterson14339b62009-01-31 16:36:08 +000014791 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014793 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014795 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014796 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014797 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014798 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014799 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014800 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014801
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014802 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014803 if (self == NULL) {
14804 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014805 return NULL;
14806 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014807 kind = PyUnicode_KIND(unicode);
14808 length = PyUnicode_GET_LENGTH(unicode);
14809
14810 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014811#ifdef Py_DEBUG
14812 _PyUnicode_HASH(self) = -1;
14813#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014814 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014815#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014816 _PyUnicode_STATE(self).interned = 0;
14817 _PyUnicode_STATE(self).kind = kind;
14818 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014819 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014820 _PyUnicode_STATE(self).ready = 1;
14821 _PyUnicode_WSTR(self) = NULL;
14822 _PyUnicode_UTF8_LENGTH(self) = 0;
14823 _PyUnicode_UTF8(self) = NULL;
14824 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014825 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014826
14827 share_utf8 = 0;
14828 share_wstr = 0;
14829 if (kind == PyUnicode_1BYTE_KIND) {
14830 char_size = 1;
14831 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14832 share_utf8 = 1;
14833 }
14834 else if (kind == PyUnicode_2BYTE_KIND) {
14835 char_size = 2;
14836 if (sizeof(wchar_t) == 2)
14837 share_wstr = 1;
14838 }
14839 else {
14840 assert(kind == PyUnicode_4BYTE_KIND);
14841 char_size = 4;
14842 if (sizeof(wchar_t) == 4)
14843 share_wstr = 1;
14844 }
14845
14846 /* Ensure we won't overflow the length. */
14847 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14848 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014849 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014850 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014851 data = PyObject_MALLOC((length + 1) * char_size);
14852 if (data == NULL) {
14853 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014854 goto onError;
14855 }
14856
Victor Stinnerc3c74152011-10-02 20:39:55 +020014857 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014858 if (share_utf8) {
14859 _PyUnicode_UTF8_LENGTH(self) = length;
14860 _PyUnicode_UTF8(self) = data;
14861 }
14862 if (share_wstr) {
14863 _PyUnicode_WSTR_LENGTH(self) = length;
14864 _PyUnicode_WSTR(self) = (wchar_t *)data;
14865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014866
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014867 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014868 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014869 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014870#ifdef Py_DEBUG
14871 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14872#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014873 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014874 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014875
14876onError:
14877 Py_DECREF(unicode);
14878 Py_DECREF(self);
14879 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014880}
14881
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014893
/* Forward declaration: referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots left at 0 are not set here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    /* BASETYPE: str may be subclassed; UNICODE_SUBCLASS: flag carried
       by str and its subtypes. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14939
14940/* Initialize the Unicode implementation */
14941
Victor Stinner3a50e702011-10-18 21:21:00 +020014942int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014944 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014945 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014946 0x000A, /* LINE FEED */
14947 0x000D, /* CARRIAGE RETURN */
14948 0x001C, /* FILE SEPARATOR */
14949 0x001D, /* GROUP SEPARATOR */
14950 0x001E, /* RECORD SEPARATOR */
14951 0x0085, /* NEXT LINE */
14952 0x2028, /* LINE SEPARATOR */
14953 0x2029, /* PARAGRAPH SEPARATOR */
14954 };
14955
Fred Drakee4315f52000-05-09 19:53:39 +000014956 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014957 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014958 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014959 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014960 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014961
Guido van Rossumcacfc072002-05-24 19:01:59 +000014962 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014963 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014964
14965 /* initialize the linebreak bloom filter */
14966 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014967 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014968 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014969
Christian Heimes26532f72013-07-20 14:57:16 +020014970 if (PyType_Ready(&EncodingMapType) < 0)
14971 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014972
Benjamin Petersonc4311282012-10-30 23:21:10 -040014973 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14974 Py_FatalError("Can't initialize field name iterator type");
14975
14976 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14977 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014978
Victor Stinner3a50e702011-10-18 21:21:00 +020014979 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980}
14981
14982/* Finalize the Unicode implementation */
14983
/* Nothing is released here: the function unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14989
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990void
Thomas Wouters78890102000-07-22 19:25:51 +000014991_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014992{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014993 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014994
Serhiy Storchaka05997252013-01-26 12:14:02 +020014995 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014996
Serhiy Storchaka05997252013-01-26 12:14:02 +020014997 for (i = 0; i < 256; i++)
14998 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014999 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015000 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015001}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015002
Walter Dörwald16807132007-05-25 13:52:07 +000015003void
15004PyUnicode_InternInPlace(PyObject **p)
15005{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015006 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015007 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015008#ifdef Py_DEBUG
15009 assert(s != NULL);
15010 assert(_PyUnicode_CHECK(s));
15011#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015012 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015013 return;
15014#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 /* If it's a subclass, we don't really know what putting
15016 it in the interned dict might do. */
15017 if (!PyUnicode_CheckExact(s))
15018 return;
15019 if (PyUnicode_CHECK_INTERNED(s))
15020 return;
15021 if (interned == NULL) {
15022 interned = PyDict_New();
15023 if (interned == NULL) {
15024 PyErr_Clear(); /* Don't leave an exception */
15025 return;
15026 }
15027 }
15028 /* It might be that the GetItem call fails even
15029 though the key is present in the dictionary,
15030 namely when this happens during a stack overflow. */
15031 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015032 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015034
Victor Stinnerf0335102013-04-14 19:13:03 +020015035 if (t) {
15036 Py_INCREF(t);
15037 Py_DECREF(*p);
15038 *p = t;
15039 return;
15040 }
Walter Dörwald16807132007-05-25 13:52:07 +000015041
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015043 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 PyErr_Clear();
15045 PyThreadState_GET()->recursion_critical = 0;
15046 return;
15047 }
15048 PyThreadState_GET()->recursion_critical = 0;
15049 /* The two references in interned are not counted by refcnt.
15050 The deallocator will take care of this */
15051 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015052 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015053}
15054
15055void
15056PyUnicode_InternImmortal(PyObject **p)
15057{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015058 PyUnicode_InternInPlace(p);
15059 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015060 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015061 Py_INCREF(*p);
15062 }
Walter Dörwald16807132007-05-25 13:52:07 +000015063}
15064
15065PyObject *
15066PyUnicode_InternFromString(const char *cp)
15067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 PyObject *s = PyUnicode_FromString(cp);
15069 if (s == NULL)
15070 return NULL;
15071 PyUnicode_InternInPlace(&s);
15072 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015073}
15074
Alexander Belopolsky40018472011-02-26 01:02:56 +000015075void
15076_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015079 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 Py_ssize_t i, n;
15081 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015082
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 if (interned == NULL || !PyDict_Check(interned))
15084 return;
15085 keys = PyDict_Keys(interned);
15086 if (keys == NULL || !PyList_Check(keys)) {
15087 PyErr_Clear();
15088 return;
15089 }
Walter Dörwald16807132007-05-25 13:52:07 +000015090
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15092 detector, interned unicode strings are not forcibly deallocated;
15093 rather, we give them their stolen references back, and then clear
15094 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015095
Benjamin Peterson14339b62009-01-31 16:36:08 +000015096 n = PyList_GET_SIZE(keys);
15097 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015098 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015100 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015101 if (PyUnicode_READY(s) == -1) {
15102 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015103 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015105 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 case SSTATE_NOT_INTERNED:
15107 /* XXX Shouldn't happen */
15108 break;
15109 case SSTATE_INTERNED_IMMORTAL:
15110 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015111 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 break;
15113 case SSTATE_INTERNED_MORTAL:
15114 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015115 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 break;
15117 default:
15118 Py_FatalError("Inconsistent interned string state.");
15119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015120 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 }
15122 fprintf(stderr, "total size of all interned strings: "
15123 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15124 "mortal/immortal\n", mortal_size, immortal_size);
15125 Py_DECREF(keys);
15126 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015127 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015128}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015129
15130
15131/********************* Unicode Iterator **************************/
15132
/* State of an iterator over the characters of a str. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15138
15139static void
15140unicodeiter_dealloc(unicodeiterobject *it)
15141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 _PyObject_GC_UNTRACK(it);
15143 Py_XDECREF(it->it_seq);
15144 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015145}
15146
15147static int
15148unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 Py_VISIT(it->it_seq);
15151 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015152}
15153
15154static PyObject *
15155unicodeiter_next(unicodeiterobject *it)
15156{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015157 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015158
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 assert(it != NULL);
15160 seq = it->it_seq;
15161 if (seq == NULL)
15162 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015163 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015165 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15166 int kind = PyUnicode_KIND(seq);
15167 void *data = PyUnicode_DATA(seq);
15168 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15169 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015170 if (item != NULL)
15171 ++it->it_index;
15172 return item;
15173 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 Py_DECREF(seq);
15176 it->it_seq = NULL;
15177 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015178}
15179
15180static PyObject *
15181unicodeiter_len(unicodeiterobject *it)
15182{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 Py_ssize_t len = 0;
15184 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015185 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015187}
15188
15189PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15190
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015191static PyObject *
15192unicodeiter_reduce(unicodeiterobject *it)
15193{
15194 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015195 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015196 it->it_seq, it->it_index);
15197 } else {
15198 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15199 if (u == NULL)
15200 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015201 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015202 }
15203}
15204
15205PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15206
15207static PyObject *
15208unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15209{
15210 Py_ssize_t index = PyLong_AsSsize_t(state);
15211 if (index == -1 && PyErr_Occurred())
15212 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015213 if (it->it_seq != NULL) {
15214 if (index < 0)
15215 index = 0;
15216 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15217 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15218 it->it_index = index;
15219 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015220 Py_RETURN_NONE;
15221}
15222
15223PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15224
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015225static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015227 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015228 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15229 reduce_doc},
15230 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15231 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015233};
15234
15235PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15237 "str_iterator", /* tp_name */
15238 sizeof(unicodeiterobject), /* tp_basicsize */
15239 0, /* tp_itemsize */
15240 /* methods */
15241 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15242 0, /* tp_print */
15243 0, /* tp_getattr */
15244 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015245 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 0, /* tp_repr */
15247 0, /* tp_as_number */
15248 0, /* tp_as_sequence */
15249 0, /* tp_as_mapping */
15250 0, /* tp_hash */
15251 0, /* tp_call */
15252 0, /* tp_str */
15253 PyObject_GenericGetAttr, /* tp_getattro */
15254 0, /* tp_setattro */
15255 0, /* tp_as_buffer */
15256 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15257 0, /* tp_doc */
15258 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15259 0, /* tp_clear */
15260 0, /* tp_richcompare */
15261 0, /* tp_weaklistoffset */
15262 PyObject_SelfIter, /* tp_iter */
15263 (iternextfunc)unicodeiter_next, /* tp_iternext */
15264 unicodeiter_methods, /* tp_methods */
15265 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015266};
15267
15268static PyObject *
15269unicode_iter(PyObject *seq)
15270{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015272
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 if (!PyUnicode_Check(seq)) {
15274 PyErr_BadInternalCall();
15275 return NULL;
15276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015277 if (PyUnicode_READY(seq) == -1)
15278 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15280 if (it == NULL)
15281 return NULL;
15282 it->it_index = 0;
15283 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015284 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015285 _PyObject_GC_TRACK(it);
15286 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015287}
15288
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015289
15290size_t
15291Py_UNICODE_strlen(const Py_UNICODE *u)
15292{
15293 int res = 0;
15294 while(*u++)
15295 res++;
15296 return res;
15297}
15298
15299Py_UNICODE*
15300Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15301{
15302 Py_UNICODE *u = s1;
15303 while ((*u++ = *s2++));
15304 return s1;
15305}
15306
15307Py_UNICODE*
15308Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15309{
15310 Py_UNICODE *u = s1;
15311 while ((*u++ = *s2++))
15312 if (n-- == 0)
15313 break;
15314 return s1;
15315}
15316
15317Py_UNICODE*
15318Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15319{
15320 Py_UNICODE *u1 = s1;
15321 u1 += Py_UNICODE_strlen(u1);
15322 Py_UNICODE_strcpy(u1, s2);
15323 return s1;
15324}
15325
15326int
15327Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15328{
15329 while (*s1 && *s2 && *s1 == *s2)
15330 s1++, s2++;
15331 if (*s1 && *s2)
15332 return (*s1 < *s2) ? -1 : +1;
15333 if (*s1)
15334 return 1;
15335 if (*s2)
15336 return -1;
15337 return 0;
15338}
15339
15340int
15341Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15342{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015343 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015344 for (; n != 0; n--) {
15345 u1 = *s1;
15346 u2 = *s2;
15347 if (u1 != u2)
15348 return (u1 < u2) ? -1 : +1;
15349 if (u1 == '\0')
15350 return 0;
15351 s1++;
15352 s2++;
15353 }
15354 return 0;
15355}
15356
15357Py_UNICODE*
15358Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15359{
15360 const Py_UNICODE *p;
15361 for (p = s; *p; p++)
15362 if (*p == c)
15363 return (Py_UNICODE*)p;
15364 return NULL;
15365}
15366
15367Py_UNICODE*
15368Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15369{
15370 const Py_UNICODE *p;
15371 p = s + Py_UNICODE_strlen(s);
15372 while (p != s) {
15373 p--;
15374 if (*p == c)
15375 return (Py_UNICODE*)p;
15376 }
15377 return NULL;
15378}
Victor Stinner331ea922010-08-10 16:37:20 +000015379
Victor Stinner71133ff2010-09-01 23:43:53 +000015380Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015381PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015382{
Victor Stinner577db2c2011-10-11 22:12:48 +020015383 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015384 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015386 if (!PyUnicode_Check(unicode)) {
15387 PyErr_BadArgument();
15388 return NULL;
15389 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015390 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015391 if (u == NULL)
15392 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015393 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015394 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015395 PyErr_NoMemory();
15396 return NULL;
15397 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015398 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015399 size *= sizeof(Py_UNICODE);
15400 copy = PyMem_Malloc(size);
15401 if (copy == NULL) {
15402 PyErr_NoMemory();
15403 return NULL;
15404 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015405 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015406 return copy;
15407}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015408
Georg Brandl66c221e2010-10-14 07:04:07 +000015409/* A _string module, to export formatter_parser and formatter_field_name_split
15410 to the string.Formatter class implemented in Python. */
15411
15412static PyMethodDef _string_methods[] = {
15413 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15414 METH_O, PyDoc_STR("split the argument as a field name")},
15415 {"formatter_parser", (PyCFunction) formatter_parser,
15416 METH_O, PyDoc_STR("parse the argument as a format string")},
15417 {NULL, NULL}
15418};
15419
15420static struct PyModuleDef _string_module = {
15421 PyModuleDef_HEAD_INIT,
15422 "_string",
15423 PyDoc_STR("string helper module"),
15424 0,
15425 _string_methods,
15426 NULL,
15427 NULL,
15428 NULL,
15429 NULL
15430};
15431
15432PyMODINIT_FUNC
15433PyInit__string(void)
15434{
15435 return PyModule_Create(&_string_module);
15436}
15437
15438
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015439#ifdef __cplusplus
15440}
15441#endif