blob: 3225fb3be9d41b358c3ee9e7b1792ceb8dd333d6 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Type check used by the assertions in the accessor macros of this file.
   Release builds only verify the type; debug builds run the structural
   consistency check as well (without inspecting the character data). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* --- Field accessors ---------------------------------------------------

   Macros whose name starts with an underscore and that contain no assert()
   are raw, unchecked field accesses (usable as lvalues).  The others
   assert on the object first and evaluate to an rvalue. */

/* Raw 'utf8' field of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII object this is the
   memory located right after the PyASCIIObject header, otherwise it is the
   'utf8' field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw 'utf8_length' field of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string: the 'length' field for a
   compact ASCII object, the 'utf8_length' field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw 'wstr' and 'wstr_length' fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw 'length', 'state' and 'hash' fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* 'state.kind' and 'length', guarded by a _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw 'data.any' pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY(): any previous definition is
   dropped and this one asserts _PyUnicode_CHECK(op) first.  Evaluates to 0
   when the string is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the 'utf8' pointer of 'op' is the very same pointer as its
   character data, i.e. both representations share one buffer.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the 'wstr' pointer of 'op' is the very same pointer as its
   character data, i.e. both representations share one buffer.
   Only the macro argument is inspected: the macro must not depend on a
   variable named 'unicode' being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True when 'op' has a UTF-8 buffer of its own: the object is not compact
   ASCII, its 'utf8' pointer is set, and that pointer is not the character
   data itself (i.e. the buffer is not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) &&                             \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
133
/* True when 'op' has a wstr buffer of its own: the 'wstr' pointer is set
   and either the string is not ready yet, or the pointer is not the
   character data itself (i.e. the buffer is not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Copy a run of characters from one character width to another.

   from_type / to_type: valid type names of the source and destination
                        characters.
   begin / end:         pointers ("from_type *") delimiting the source run;
                        'end' points one past the last source character.
   to:                  pointer ("to_type *") to the destination buffer.

   Every source character is cast to to_type individually.  The bulk of the
   run is copied four characters per iteration; the remaining (at most
   three) characters are copied one by one.  Each argument is evaluated
   exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_src_end = (const from_type *)(end);           \
        Py_ssize_t _count = (_src_end) - (_src);                        \
        const from_type *_quad_end =                                    \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < (_quad_end); _src += 4, _dst += 4) {              \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < (_src_end); _src++, _dst++)                       \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0) and, on success,
   its reference count is incremented once more for the caller.  If the
   creation fails, unicode_empty is left NULL. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (the returned pointer is NULL if the singleton could not be
   created). */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Lookup table for the most frequent whitespace characters: entry c
   (0 <= c < 128) is 1 if code point c is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
         0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
         0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
         0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
         0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Lookup table for line break characters: entry c (0 <= c < 128) is 1 if
   code point c is a line break and 0 otherwise.  The table is only ever
   read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
         0x0A LINE FEED, 0x0B LINE TABULATION,
         0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
         0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object.

   Verifies with assert() that the state flags, the kind and the data/utf8/
   wstr pointers and lengths of 'op' agree with each other for the layout
   the object claims to have (compact ASCII, compact, or legacy).  When
   'check_content' is non-zero and the string is not of the wchar kind, the
   characters are scanned as well to verify that the narrowest possible
   kind is used and that the buffer is terminated by a null character.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: nothing but the header needs checking */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            /* legacy object: characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character buffer exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > highest)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (as many as are needed to
   index BLOOM_WIDTH bits) as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters.  Starts out with every bit set,
   so that BLOOM() never rejects a character before the mask is set up. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of 'ch' is set in 'mask'.
   A zero result means 'ch' is definitely not in the set described by the
   mask; a non-zero result means it may be.  Both arguments are
   parenthesized so that arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True if 'ch' is a line break: code points below 128 are looked up in the
   ascii_linebreak table, all others are pre-filtered through the bloom
   mask before Py_UNICODE_ISLINEBREAK() is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
Victor Stinnerafffce42012-10-03 23:03:17 +0200677#ifdef Py_DEBUG
678/* Fill the data of an Unicode string with invalid characters to detect bugs
679 earlier.
680
681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
683 invalid character in Unicode 6.0. */
684static void
685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
686{
687 int kind = PyUnicode_KIND(unicode);
688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
689 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
690 if (length <= old_length)
691 return;
692 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
693}
694#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Victor Stinner84def372011-12-11 20:04:56 +0100725 _Py_DEC_REFTOTAL;
726 _Py_ForgetReference(unicode);
727
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300728 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100729 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100730 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 PyErr_NoMemory();
732 return NULL;
733 }
Victor Stinner84def372011-12-11 20:04:56 +0100734 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100736
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100740 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_WSTR_LENGTH(unicode) = length;
742 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100743 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
744 PyObject_DEL(_PyUnicode_WSTR(unicode));
745 _PyUnicode_WSTR(unicode) = NULL;
746 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200747#ifdef Py_DEBUG
748 unicode_fill_invalid(unicode, old_length);
749#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
751 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200752 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 return unicode;
754}
755
Alexander Belopolsky40018472011-02-26 01:02:56 +0000756static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200757resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
Victor Stinner95663112011-10-04 01:03:50 +0200759 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100760 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200761 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200762 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 if (PyUnicode_IS_READY(unicode)) {
765 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200766 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
770#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771
772 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200773 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200774 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
775 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776
777 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 new_size = (length + 1) * char_size;
782
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
784 {
785 PyObject_DEL(_PyUnicode_UTF8(unicode));
786 _PyUnicode_UTF8(unicode) = NULL;
787 _PyUnicode_UTF8_LENGTH(unicode) = 0;
788 }
789
Victor Stinnerfe226c02011-10-03 03:52:20 +0200790 data = (PyObject *)PyObject_REALLOC(data, new_size);
791 if (data == NULL) {
792 PyErr_NoMemory();
793 return -1;
794 }
795 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200796 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 _PyUnicode_WSTR_LENGTH(unicode) = length;
799 }
800 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200801 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_UTF8_LENGTH(unicode) = length;
803 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_LENGTH(unicode) = length;
805 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200806#ifdef Py_DEBUG
807 unicode_fill_invalid(unicode, old_length);
808#endif
Victor Stinner95663112011-10-04 01:03:50 +0200809 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200810 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
Victor Stinner95663112011-10-04 01:03:50 +0200814 assert(_PyUnicode_WSTR(unicode) != NULL);
815
816 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700817 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200818 PyErr_NoMemory();
819 return -1;
820 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100821 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200822 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200824 if (!wstr) {
825 PyErr_NoMemory();
826 return -1;
827 }
828 _PyUnicode_WSTR(unicode) = wstr;
829 _PyUnicode_WSTR(unicode)[length] = 0;
830 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200831 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832 return 0;
833}
834
Victor Stinnerfe226c02011-10-03 03:52:20 +0200835static PyObject*
836resize_copy(PyObject *unicode, Py_ssize_t length)
837{
838 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100839 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200840 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841
Benjamin Petersonbac79492012-01-14 13:34:47 -0500842 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844
845 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
846 if (copy == NULL)
847 return NULL;
848
849 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200850 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200852 }
853 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200854 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100855
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200857 if (w == NULL)
858 return NULL;
859 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
860 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200861 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
862 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 }
865}
866
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
875
Alexander Belopolsky40018472011-02-26 01:02:56 +0000876static PyUnicodeObject *
877_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200879 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 if (length == 0 && unicode_empty != NULL) {
884 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200885 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886 }
887
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000888 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700889 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 return (PyUnicodeObject *)PyErr_NoMemory();
891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 if (length < 0) {
893 PyErr_SetString(PyExc_SystemError,
894 "Negative size passed to _PyUnicode_New");
895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896 }
897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
899 if (unicode == NULL)
900 return NULL;
901 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100902
903 _PyUnicode_WSTR_LENGTH(unicode) = length;
904 _PyUnicode_HASH(unicode) = -1;
905 _PyUnicode_STATE(unicode).interned = 0;
906 _PyUnicode_STATE(unicode).kind = 0;
907 _PyUnicode_STATE(unicode).compact = 0;
908 _PyUnicode_STATE(unicode).ready = 0;
909 _PyUnicode_STATE(unicode).ascii = 0;
910 _PyUnicode_DATA_ANY(unicode) = NULL;
911 _PyUnicode_LENGTH(unicode) = 0;
912 _PyUnicode_UTF8(unicode) = NULL;
913 _PyUnicode_UTF8_LENGTH(unicode) = 0;
914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
916 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921
Jeremy Hyltond8082792003-09-16 19:41:39 +0000922 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000923 * the caller fails before initializing str -- unicode_resize()
924 * reads str[0], and the Keep-Alive optimization can keep memory
925 * allocated for str alive across a call to unicode_dealloc(unicode).
926 * We don't want unicode_resize to read uninitialized memory in
927 * that case.
928 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 _PyUnicode_WSTR(unicode)[0] = 0;
930 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100931
Victor Stinner7931d9a2011-11-04 00:22:48 +0100932 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933 return unicode;
934}
935
Victor Stinnerf42dc442011-10-02 23:33:16 +0200936static const char*
937unicode_kind_name(PyObject *unicode)
938{
Victor Stinner42dfd712011-10-03 14:41:45 +0200939 /* don't check consistency: unicode_kind_name() is called from
940 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941 if (!PyUnicode_IS_COMPACT(unicode))
942 {
943 if (!PyUnicode_IS_READY(unicode))
944 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600945 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 {
947 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200948 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200949 return "legacy ascii";
950 else
951 return "legacy latin1";
952 case PyUnicode_2BYTE_KIND:
953 return "legacy UCS2";
954 case PyUnicode_4BYTE_KIND:
955 return "legacy UCS4";
956 default:
957 return "<legacy invalid kind>";
958 }
959 }
960 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600961 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 return "ascii";
965 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200966 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 default:
972 return "<invalid compact kind>";
973 }
974}
975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
981
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
985void *_PyUnicode_data(void *unicode){
986 printf("obj %p\n", unicode);
987 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
988 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
989 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
990 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
991 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
992 return PyUnicode_DATA(unicode);
993}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200994
995void
996_PyUnicode_Dump(PyObject *op)
997{
998 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200999 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1000 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1001 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001002
Victor Stinnera849a4b2011-10-03 12:12:11 +02001003 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001004 {
1005 if (ascii->state.ascii)
1006 data = (ascii + 1);
1007 else
1008 data = (compact + 1);
1009 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 else
1011 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001012 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1013 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001020 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001023 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1024 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001147 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
1149 This function assumes that unicode can hold one more code point than wstr
1150 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001151static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001153 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 const wchar_t *iter;
1156 Py_UCS4 *ucs4_out;
1157
Victor Stinner910337b2011-10-03 03:20:16 +02001158 assert(unicode != NULL);
1159 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163 for (iter = begin; iter < end; ) {
1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167 && (iter+1) < end
1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 {
Victor Stinner551ac952011-11-29 22:58:13 +01001170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 iter += 2;
1172 }
1173 else {
1174 *ucs4_out++ = *iter;
1175 iter++;
1176 }
1177 }
1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179 _PyUnicode_GET_LENGTH(unicode)));
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181}
1182#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001536 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1537 PyErr_NoMemory();
1538 return -1;
1539 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1541 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyErr_NoMemory();
1543 return -1;
1544 }
1545 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1546 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001547 _PyUnicode_UTF8(unicode) = NULL;
1548 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001549 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1550 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001551 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552 PyObject_FREE(_PyUnicode_WSTR(unicode));
1553 _PyUnicode_WSTR(unicode) = NULL;
1554 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1555#else
1556 assert(num_surrogates == 0);
1557
Victor Stinnerc3c74152011-10-02 20:39:55 +02001558 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001560 _PyUnicode_UTF8(unicode) = NULL;
1561 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1563#endif
1564 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1565 }
1566 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001567 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 return 0;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001572unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573{
Walter Dörwald16807132007-05-25 13:52:07 +00001574 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 case SSTATE_NOT_INTERNED:
1576 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 case SSTATE_INTERNED_MORTAL:
1579 /* revive dead object temporarily for DelItem */
1580 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001581 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 Py_FatalError(
1583 "deletion of interned string failed");
1584 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001585
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 case SSTATE_INTERNED_IMMORTAL:
1587 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 default:
1590 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001591 }
1592
Victor Stinner03490912011-10-03 23:45:12 +02001593 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001595 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001596 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1598 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601}
1602
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a canonical string of length one can be a cached Latin-1 char. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1619
Alexander Belopolsky40018472011-02-26 01:02:56 +00001620static int
Victor Stinner488fa492011-12-12 00:01:39 +01001621unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001622{
Victor Stinner488fa492011-12-12 00:01:39 +01001623 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (Py_REFCNT(unicode) != 1)
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (_PyUnicode_HASH(unicode) != -1)
1627 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_CHECK_INTERNED(unicode))
1629 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001630 if (!PyUnicode_CheckExact(unicode))
1631 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001632#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001633 /* singleton refcount is greater than 1 */
1634 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001635#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636 return 1;
1637}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001638
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639static int
1640unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1641{
1642 PyObject *unicode;
1643 Py_ssize_t old_length;
1644
1645 assert(p_unicode != NULL);
1646 unicode = *p_unicode;
1647
1648 assert(unicode != NULL);
1649 assert(PyUnicode_Check(unicode));
1650 assert(0 <= length);
1651
Victor Stinner910337b2011-10-03 03:20:16 +02001652 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001653 old_length = PyUnicode_WSTR_LENGTH(unicode);
1654 else
1655 old_length = PyUnicode_GET_LENGTH(unicode);
1656 if (old_length == length)
1657 return 0;
1658
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001660 _Py_INCREF_UNICODE_EMPTY();
1661 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 Py_DECREF(*p_unicode);
1664 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001665 return 0;
1666 }
1667
Victor Stinner488fa492011-12-12 00:01:39 +01001668 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001669 PyObject *copy = resize_copy(unicode, length);
1670 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001671 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 Py_DECREF(*p_unicode);
1673 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001675 }
1676
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001678 PyObject *new_unicode = resize_compact(unicode, length);
1679 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001681 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001683 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001684 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001685}
1686
Alexander Belopolsky40018472011-02-26 01:02:56 +00001687int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001689{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001690 PyObject *unicode;
1691 if (p_unicode == NULL) {
1692 PyErr_BadInternalCall();
1693 return -1;
1694 }
1695 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001696 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697 {
1698 PyErr_BadInternalCall();
1699 return -1;
1700 }
1701 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001702}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001703
Victor Stinnerc5166102012-02-22 13:55:02 +01001704/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001705
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001706 WARNING: The function doesn't copy the terminating null character and
1707 doesn't check the maximum character (may write a latin1 character in an
1708 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001709static void
1710unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1711 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001712{
1713 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1714 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001715 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001716
1717 switch (kind) {
1718 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001720#ifdef Py_DEBUG
1721 if (PyUnicode_IS_ASCII(unicode)) {
1722 Py_UCS4 maxchar = ucs1lib_find_max_char(
1723 (const Py_UCS1*)str,
1724 (const Py_UCS1*)str + len);
1725 assert(maxchar < 128);
1726 }
1727#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001728 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001729 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001730 }
1731 case PyUnicode_2BYTE_KIND: {
1732 Py_UCS2 *start = (Py_UCS2 *)data + index;
1733 Py_UCS2 *ucs2 = start;
1734 assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
Victor Stinner184252a2012-06-16 02:57:41 +02001736 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 *ucs2 = (Py_UCS2)*str;
1738
1739 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001740 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001741 }
1742 default: {
1743 Py_UCS4 *start = (Py_UCS4 *)data + index;
1744 Py_UCS4 *ucs4 = start;
1745 assert(kind == PyUnicode_4BYTE_KIND);
1746 assert(index <= PyUnicode_GET_LENGTH(unicode));
1747
Victor Stinner184252a2012-06-16 02:57:41 +02001748 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001749 *ucs4 = (Py_UCS4)*str;
1750
1751 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001752 }
1753 }
1754}
1755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756static PyObject*
1757get_latin1_char(unsigned char ch)
1758{
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 if (!unicode)
1763 return NULL;
1764 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001765 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 unicode_latin1[ch] = unicode;
1767 }
1768 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001769 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770}
1771
Victor Stinner985a82a2014-01-03 12:53:47 +01001772static PyObject*
1773unicode_char(Py_UCS4 ch)
1774{
1775 PyObject *unicode;
1776
1777 assert(ch <= MAX_UNICODE);
1778
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001779 if (ch < 256)
1780 return get_latin1_char(ch);
1781
Victor Stinner985a82a2014-01-03 12:53:47 +01001782 unicode = PyUnicode_New(1, ch);
1783 if (unicode == NULL)
1784 return NULL;
1785 switch (PyUnicode_KIND(unicode)) {
1786 case PyUnicode_1BYTE_KIND:
1787 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1788 break;
1789 case PyUnicode_2BYTE_KIND:
1790 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1791 break;
1792 default:
1793 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1794 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1795 }
1796 assert(_PyUnicode_CheckConsistency(unicode, 1));
1797 return unicode;
1798}
1799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001803 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 Py_UCS4 maxchar = 0;
1805 Py_ssize_t num_surrogates;
1806
1807 if (u == NULL)
1808 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810 /* If the Unicode data is known at construction time, we can apply
1811 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001814 if (size == 0)
1815 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 /* Single character Unicode objects in the Latin-1 range are
1818 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001819 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 return get_latin1_char((unsigned char)*u);
1821
1822 /* If not empty and not single character, copy the Unicode data
1823 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001824 if (find_maxchar_surrogates(u, u + size,
1825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return NULL;
1827
Victor Stinner8faf8212011-12-08 22:14:11 +01001828 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 if (!unicode)
1830 return NULL;
1831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 switch (PyUnicode_KIND(unicode)) {
1833 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1836 break;
1837 case PyUnicode_2BYTE_KIND:
1838#if Py_UNICODE_SIZE == 2
1839 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1840#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001841 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1843#endif
1844 break;
1845 case PyUnicode_4BYTE_KIND:
1846#if SIZEOF_WCHAR_T == 2
1847 /* This is the only case which has to process surrogates, thus
1848 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001849 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850#else
1851 assert(num_surrogates == 0);
1852 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1853#endif
1854 break;
1855 default:
1856 assert(0 && "Impossible state");
1857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001859 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860}
1861
Alexander Belopolsky40018472011-02-26 01:02:56 +00001862PyObject *
1863PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001864{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 if (size < 0) {
1866 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 return NULL;
1869 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001870 if (u != NULL)
1871 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1872 else
1873 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001874}
1875
Alexander Belopolsky40018472011-02-26 01:02:56 +00001876PyObject *
1877PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001878{
1879 size_t size = strlen(u);
1880 if (size > PY_SSIZE_T_MAX) {
1881 PyErr_SetString(PyExc_OverflowError, "input too long");
1882 return NULL;
1883 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001884 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001885}
1886
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001887PyObject *
1888_PyUnicode_FromId(_Py_Identifier *id)
1889{
1890 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001891 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1892 strlen(id->string),
1893 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001894 if (!id->object)
1895 return NULL;
1896 PyUnicode_InternInPlace(&id->object);
1897 assert(!id->next);
1898 id->next = static_strings;
1899 static_strings = id;
1900 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001901 return id->object;
1902}
1903
1904void
1905_PyUnicode_ClearStaticStrings()
1906{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001907 _Py_Identifier *tmp, *s = static_strings;
1908 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001909 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 tmp = s->next;
1911 s->next = NULL;
1912 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001914 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001915}
1916
Benjamin Peterson0df54292012-03-26 14:50:32 -04001917/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Victor Stinnerd3f08822012-05-29 12:57:52 +02001919PyObject*
1920_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001921{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001922 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001923 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001924 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001926 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001927#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001928 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001929 }
Victor Stinner785938e2011-12-11 20:09:03 +01001930 unicode = PyUnicode_New(size, 127);
1931 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001932 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001933 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1934 assert(_PyUnicode_CheckConsistency(unicode, 1));
1935 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001936}
1937
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938static Py_UCS4
1939kind_maxchar_limit(unsigned int kind)
1940{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001941 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001942 case PyUnicode_1BYTE_KIND:
1943 return 0x80;
1944 case PyUnicode_2BYTE_KIND:
1945 return 0x100;
1946 case PyUnicode_4BYTE_KIND:
1947 return 0x10000;
1948 default:
1949 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001950 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001951 }
1952}
1953
Victor Stinnere6abb482012-05-02 01:15:40 +02001954Py_LOCAL_INLINE(Py_UCS4)
1955align_maxchar(Py_UCS4 maxchar)
1956{
1957 if (maxchar <= 127)
1958 return 127;
1959 else if (maxchar <= 255)
1960 return 255;
1961 else if (maxchar <= 65535)
1962 return 65535;
1963 else
1964 return MAX_UNICODE;
1965}
1966
Victor Stinner702c7342011-10-05 13:50:52 +02001967static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001968_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001972
Serhiy Storchaka678db842013-01-26 12:16:36 +02001973 if (size == 0)
1974 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001976 if (size == 1)
1977 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
1983 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001986}
1987
Victor Stinnere57b1c02011-09-28 22:20:48 +02001988static PyObject*
1989_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990{
1991 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001993
Serhiy Storchaka678db842013-01-26 12:16:36 +02001994 if (size == 0)
1995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001996 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 if (size == 1)
1998 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002000 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (!res)
2003 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002006 else {
2007 _PyUnicode_CONVERT_BYTES(
2008 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2009 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002010 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 return res;
2012}
2013
Victor Stinnere57b1c02011-09-28 22:20:48 +02002014static PyObject*
2015_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016{
2017 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019
Serhiy Storchaka678db842013-01-26 12:16:36 +02002020 if (size == 0)
2021 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 if (size == 1)
2024 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002025
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002026 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002027 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (!res)
2029 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002030 if (max_char < 256)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2032 PyUnicode_1BYTE_DATA(res));
2033 else if (max_char < 0x10000)
2034 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2035 PyUnicode_2BYTE_DATA(res));
2036 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002038 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 return res;
2040}
2041
2042PyObject*
2043PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2044{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002045 if (size < 0) {
2046 PyErr_SetString(PyExc_ValueError, "size must be positive");
2047 return NULL;
2048 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002049 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002055 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002056 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002057 PyErr_SetString(PyExc_SystemError, "invalid kind");
2058 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060}
2061
Victor Stinnerece58de2012-04-23 23:36:38 +02002062Py_UCS4
2063_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2064{
2065 enum PyUnicode_Kind kind;
2066 void *startptr, *endptr;
2067
2068 assert(PyUnicode_IS_READY(unicode));
2069 assert(0 <= start);
2070 assert(end <= PyUnicode_GET_LENGTH(unicode));
2071 assert(start <= end);
2072
2073 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2074 return PyUnicode_MAX_CHAR_VALUE(unicode);
2075
2076 if (start == end)
2077 return 127;
2078
Victor Stinner94d558b2012-04-27 22:26:58 +02002079 if (PyUnicode_IS_ASCII(unicode))
2080 return 127;
2081
Victor Stinnerece58de2012-04-23 23:36:38 +02002082 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002083 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002084 endptr = (char *)startptr + end * kind;
2085 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002086 switch(kind) {
2087 case PyUnicode_1BYTE_KIND:
2088 return ucs1lib_find_max_char(startptr, endptr);
2089 case PyUnicode_2BYTE_KIND:
2090 return ucs2lib_find_max_char(startptr, endptr);
2091 case PyUnicode_4BYTE_KIND:
2092 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002094 assert(0);
2095 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002096 }
2097}
2098
Victor Stinner25a4b292011-10-06 12:31:55 +02002099/* Ensure that a string uses the most efficient storage, if it is not the
2100 case: create a new string with of the right kind. Write NULL into *p_unicode
2101 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002102static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002103unicode_adjust_maxchar(PyObject **p_unicode)
2104{
2105 PyObject *unicode, *copy;
2106 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002107 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002108 unsigned int kind;
2109
2110 assert(p_unicode != NULL);
2111 unicode = *p_unicode;
2112 assert(PyUnicode_IS_READY(unicode));
2113 if (PyUnicode_IS_ASCII(unicode))
2114 return;
2115
2116 len = PyUnicode_GET_LENGTH(unicode);
2117 kind = PyUnicode_KIND(unicode);
2118 if (kind == PyUnicode_1BYTE_KIND) {
2119 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002120 max_char = ucs1lib_find_max_char(u, u + len);
2121 if (max_char >= 128)
2122 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002123 }
2124 else if (kind == PyUnicode_2BYTE_KIND) {
2125 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs2lib_find_max_char(u, u + len);
2127 if (max_char >= 256)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
2130 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002133 max_char = ucs4lib_find_max_char(u, u + len);
2134 if (max_char >= 0x10000)
2135 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002138 if (copy != NULL)
2139 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002140 Py_DECREF(unicode);
2141 *p_unicode = copy;
2142}
2143
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002145_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146{
Victor Stinner87af4f22011-11-21 23:03:47 +01002147 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner034f6cf2011-09-30 02:26:44 +02002150 if (!PyUnicode_Check(unicode)) {
2151 PyErr_BadInternalCall();
2152 return NULL;
2153 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002154 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156
Victor Stinner87af4f22011-11-21 23:03:47 +01002157 length = PyUnicode_GET_LENGTH(unicode);
2158 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 if (!copy)
2160 return NULL;
2161 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2162
Victor Stinner87af4f22011-11-21 23:03:47 +01002163 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2164 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002165 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002166 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002167}
2168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170/* Widen Unicode objects to larger buffers. Don't write terminating null
2171 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172
2173void*
2174_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2175{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002176 Py_ssize_t len;
2177 void *result;
2178 unsigned int skind;
2179
Benjamin Petersonbac79492012-01-14 13:34:47 -05002180 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 return NULL;
2182
2183 len = PyUnicode_GET_LENGTH(s);
2184 skind = PyUnicode_KIND(s);
2185 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002189 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002190 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002191 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002192 if (!result)
2193 return PyErr_NoMemory();
2194 assert(skind == PyUnicode_1BYTE_KIND);
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS1, Py_UCS2,
2197 PyUnicode_1BYTE_DATA(s),
2198 PyUnicode_1BYTE_DATA(s) + len,
2199 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002201 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002202 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 if (!result)
2204 return PyErr_NoMemory();
2205 if (skind == PyUnicode_2BYTE_KIND) {
2206 _PyUnicode_CONVERT_BYTES(
2207 Py_UCS2, Py_UCS4,
2208 PyUnicode_2BYTE_DATA(s),
2209 PyUnicode_2BYTE_DATA(s) + len,
2210 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002212 else {
2213 assert(skind == PyUnicode_1BYTE_KIND);
2214 _PyUnicode_CONVERT_BYTES(
2215 Py_UCS1, Py_UCS4,
2216 PyUnicode_1BYTE_DATA(s),
2217 PyUnicode_1BYTE_DATA(s) + len,
2218 result);
2219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002221 default:
2222 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Victor Stinner01698042011-10-04 00:04:26 +02002224 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return NULL;
2226}
2227
2228static Py_UCS4*
2229as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230 int copy_null)
2231{
2232 int kind;
2233 void *data;
2234 Py_ssize_t len, targetlen;
2235 if (PyUnicode_READY(string) == -1)
2236 return NULL;
2237 kind = PyUnicode_KIND(string);
2238 data = PyUnicode_DATA(string);
2239 len = PyUnicode_GET_LENGTH(string);
2240 targetlen = len;
2241 if (copy_null)
2242 targetlen++;
2243 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002244 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
Victor Stinner15a11362012-10-06 23:48:20 +02002314/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002315 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2316 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2317#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002318
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002319static int
2320unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2321 Py_ssize_t width, Py_ssize_t precision)
2322{
2323 Py_ssize_t length, fill, arglen;
2324 Py_UCS4 maxchar;
2325
2326 if (PyUnicode_READY(str) == -1)
2327 return -1;
2328
2329 length = PyUnicode_GET_LENGTH(str);
2330 if ((precision == -1 || precision >= length)
2331 && width <= length)
2332 return _PyUnicodeWriter_WriteStr(writer, str);
2333
2334 if (precision != -1)
2335 length = Py_MIN(precision, length);
2336
2337 arglen = Py_MAX(length, width);
2338 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2339 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2340 else
2341 maxchar = writer->maxchar;
2342
2343 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2344 return -1;
2345
2346 if (width > length) {
2347 fill = width - length;
2348 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2349 return -1;
2350 writer->pos += fill;
2351 }
2352
2353 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2354 str, 0, length);
2355 writer->pos += length;
2356 return 0;
2357}
2358
2359static int
2360unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2361 Py_ssize_t width, Py_ssize_t precision)
2362{
2363 /* UTF-8 */
2364 Py_ssize_t length;
2365 PyObject *unicode;
2366 int res;
2367
2368 length = strlen(str);
2369 if (precision != -1)
2370 length = Py_MIN(length, precision);
2371 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2372 if (unicode == NULL)
2373 return -1;
2374
2375 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2376 Py_DECREF(unicode);
2377 return res;
2378}
2379
Victor Stinner96865452011-03-01 23:44:09 +00002380static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002381unicode_fromformat_arg(_PyUnicodeWriter *writer,
2382 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002383{
Victor Stinnere215d962012-10-06 23:03:36 +02002384 const char *p;
2385 Py_ssize_t len;
2386 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002387 Py_ssize_t width;
2388 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002389 int longflag;
2390 int longlongflag;
2391 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002392 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002393
2394 p = f;
2395 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002396 zeropad = 0;
2397 if (*f == '0') {
2398 zeropad = 1;
2399 f++;
2400 }
Victor Stinner96865452011-03-01 23:44:09 +00002401
2402 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002403 width = -1;
2404 if (Py_ISDIGIT((unsigned)*f)) {
2405 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002406 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002407 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002408 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002409 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002410 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002411 return NULL;
2412 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002414 f++;
2415 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 }
2417 precision = -1;
2418 if (*f == '.') {
2419 f++;
2420 if (Py_ISDIGIT((unsigned)*f)) {
2421 precision = (*f - '0');
2422 f++;
2423 while (Py_ISDIGIT((unsigned)*f)) {
2424 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2425 PyErr_SetString(PyExc_ValueError,
2426 "precision too big");
2427 return NULL;
2428 }
2429 precision = (precision * 10) + (*f - '0');
2430 f++;
2431 }
2432 }
Victor Stinner96865452011-03-01 23:44:09 +00002433 if (*f == '%') {
2434 /* "%.3%s" => f points to "3" */
2435 f--;
2436 }
2437 }
2438 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002439 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002440 f--;
2441 }
Victor Stinner96865452011-03-01 23:44:09 +00002442
2443 /* Handle %ld, %lu, %lld and %llu. */
2444 longflag = 0;
2445 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002446 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002447 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002448 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002449 longflag = 1;
2450 ++f;
2451 }
2452#ifdef HAVE_LONG_LONG
2453 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002454 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002455 longlongflag = 1;
2456 f += 2;
2457 }
2458#endif
2459 }
2460 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002461 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002462 size_tflag = 1;
2463 ++f;
2464 }
Victor Stinnere215d962012-10-06 23:03:36 +02002465
2466 if (f[1] == '\0')
2467 writer->overallocate = 0;
2468
2469 switch (*f) {
2470 case 'c':
2471 {
2472 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002473 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002474 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002475 "character argument not in range(0x110000)");
2476 return NULL;
2477 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002478 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002479 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002480 break;
2481 }
2482
2483 case 'i':
2484 case 'd':
2485 case 'u':
2486 case 'x':
2487 {
2488 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002489 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002490 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002493 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002494 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002495 va_arg(*vargs, unsigned long));
2496#ifdef HAVE_LONG_LONG
2497 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002498 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002499 va_arg(*vargs, unsigned PY_LONG_LONG));
2500#endif
2501 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002502 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002503 va_arg(*vargs, size_t));
2504 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002505 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002506 va_arg(*vargs, unsigned int));
2507 }
2508 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002509 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002510 }
2511 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002512 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002513 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002514 va_arg(*vargs, long));
2515#ifdef HAVE_LONG_LONG
2516 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002517 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002518 va_arg(*vargs, PY_LONG_LONG));
2519#endif
2520 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002521 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002522 va_arg(*vargs, Py_ssize_t));
2523 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002524 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002525 va_arg(*vargs, int));
2526 }
2527 assert(len >= 0);
2528
Victor Stinnere215d962012-10-06 23:03:36 +02002529 if (precision < len)
2530 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002531
2532 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2534 return NULL;
2535
Victor Stinnere215d962012-10-06 23:03:36 +02002536 if (width > precision) {
2537 Py_UCS4 fillchar;
2538 fill = width - precision;
2539 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2541 return NULL;
2542 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002543 }
Victor Stinner15a11362012-10-06 23:48:20 +02002544 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002545 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002546 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2547 return NULL;
2548 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002549 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002550
Victor Stinner4a587072013-11-19 12:54:53 +01002551 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2552 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002553 break;
2554 }
2555
2556 case 'p':
2557 {
2558 char number[MAX_LONG_LONG_CHARS];
2559
2560 len = sprintf(number, "%p", va_arg(*vargs, void*));
2561 assert(len >= 0);
2562
2563 /* %p is ill-defined: ensure leading 0x. */
2564 if (number[1] == 'X')
2565 number[1] = 'x';
2566 else if (number[1] != 'x') {
2567 memmove(number + 2, number,
2568 strlen(number) + 1);
2569 number[0] = '0';
2570 number[1] = 'x';
2571 len += 2;
2572 }
2573
Victor Stinner4a587072013-11-19 12:54:53 +01002574 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002575 return NULL;
2576 break;
2577 }
2578
2579 case 's':
2580 {
2581 /* UTF-8 */
2582 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002584 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002585 break;
2586 }
2587
2588 case 'U':
2589 {
2590 PyObject *obj = va_arg(*vargs, PyObject *);
2591 assert(obj && _PyUnicode_CHECK(obj));
2592
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002594 return NULL;
2595 break;
2596 }
2597
2598 case 'V':
2599 {
2600 PyObject *obj = va_arg(*vargs, PyObject *);
2601 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002602 if (obj) {
2603 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002605 return NULL;
2606 }
2607 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 assert(str != NULL);
2609 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002611 }
2612 break;
2613 }
2614
2615 case 'S':
2616 {
2617 PyObject *obj = va_arg(*vargs, PyObject *);
2618 PyObject *str;
2619 assert(obj);
2620 str = PyObject_Str(obj);
2621 if (!str)
2622 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002623 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002624 Py_DECREF(str);
2625 return NULL;
2626 }
2627 Py_DECREF(str);
2628 break;
2629 }
2630
2631 case 'R':
2632 {
2633 PyObject *obj = va_arg(*vargs, PyObject *);
2634 PyObject *repr;
2635 assert(obj);
2636 repr = PyObject_Repr(obj);
2637 if (!repr)
2638 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002640 Py_DECREF(repr);
2641 return NULL;
2642 }
2643 Py_DECREF(repr);
2644 break;
2645 }
2646
2647 case 'A':
2648 {
2649 PyObject *obj = va_arg(*vargs, PyObject *);
2650 PyObject *ascii;
2651 assert(obj);
2652 ascii = PyObject_ASCII(obj);
2653 if (!ascii)
2654 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002655 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002656 Py_DECREF(ascii);
2657 return NULL;
2658 }
2659 Py_DECREF(ascii);
2660 break;
2661 }
2662
2663 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002664 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002665 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 break;
2667
2668 default:
2669 /* if we stumble upon an unknown formatting code, copy the rest
2670 of the format string to the output string. (we cannot just
2671 skip the code, since there's no way to know what's in the
2672 argument list) */
2673 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002674 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002675 return NULL;
2676 f = p+len;
2677 return f;
2678 }
2679
2680 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002681 return f;
2682}
2683
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684PyObject *
2685PyUnicode_FromFormatV(const char *format, va_list vargs)
2686{
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_list vargs2;
2688 const char *f;
2689 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002690
Victor Stinner8f674cc2013-04-17 23:02:17 +02002691 _PyUnicodeWriter_Init(&writer);
2692 writer.min_length = strlen(format) + 100;
2693 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2696 Copy it to be able to pass a reference to a subfunction. */
2697 Py_VA_COPY(vargs2, vargs);
2698
2699 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002701 f = unicode_fromformat_arg(&writer, f, &vargs2);
2702 if (f == NULL)
2703 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002706 const char *p;
2707 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 p = f;
2710 do
2711 {
2712 if ((unsigned char)*p > 127) {
2713 PyErr_Format(PyExc_ValueError,
2714 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2715 "string, got a non-ASCII byte: 0x%02x",
2716 (unsigned char)*p);
2717 return NULL;
2718 }
2719 p++;
2720 }
2721 while (*p != '\0' && *p != '%');
2722 len = p - f;
2723
2724 if (*p == '\0')
2725 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002726
2727 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002728 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002732 }
Victor Stinnere215d962012-10-06 23:03:36 +02002733 return _PyUnicodeWriter_Finish(&writer);
2734
2735 fail:
2736 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002738}
2739
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740PyObject *
2741PyUnicode_FromFormat(const char *format, ...)
2742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 PyObject* ret;
2744 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745
2746#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002748#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 ret = PyUnicode_FromFormatV(format, vargs);
2752 va_end(vargs);
2753 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002754}
2755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756#ifdef HAVE_WCHAR_H
2757
Victor Stinner5593d8a2010-10-02 11:11:27 +00002758/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2759 convert a Unicode object to a wide character string.
2760
Victor Stinnerd88d9832011-09-06 02:00:05 +02002761 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002762 character) required to convert the unicode object. Ignore size argument.
2763
Victor Stinnerd88d9832011-09-06 02:00:05 +02002764 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002765 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002768unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 wchar_t *w,
2770 Py_ssize_t size)
2771{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002772 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 const wchar_t *wstr;
2774
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002775 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 if (wstr == NULL)
2777 return -1;
2778
Victor Stinner5593d8a2010-10-02 11:11:27 +00002779 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (size > res)
2781 size = res + 1;
2782 else
2783 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 return res;
2786 }
2787 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002789}
2790
2791Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002792PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002793 wchar_t *w,
2794 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795{
2796 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 PyErr_BadInternalCall();
2798 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002800 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
Victor Stinner137c34c2010-09-29 10:25:54 +00002803wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002804PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002805 Py_ssize_t *size)
2806{
2807 wchar_t* buffer;
2808 Py_ssize_t buflen;
2809
2810 if (unicode == NULL) {
2811 PyErr_BadInternalCall();
2812 return NULL;
2813 }
2814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002815 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 if (buflen == -1)
2817 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002818 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002819 if (buffer == NULL) {
2820 PyErr_NoMemory();
2821 return NULL;
2822 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002823 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002824 if (buflen == -1) {
2825 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002827 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 if (size != NULL)
2829 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 return buffer;
2831}
2832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
Alexander Belopolsky40018472011-02-26 01:02:56 +00002835PyObject *
2836PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002837{
Victor Stinner8faf8212011-12-08 22:14:11 +01002838 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyErr_SetString(PyExc_ValueError,
2840 "chr() arg not in range(0x110000)");
2841 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002843
Victor Stinner985a82a2014-01-03 12:53:47 +01002844 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002845}
2846
Alexander Belopolsky40018472011-02-26 01:02:56 +00002847PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002848PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002850 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002853 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002854 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 Py_INCREF(obj);
2856 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 }
2858 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 /* For a Unicode subtype that's not a Unicode object,
2860 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002861 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002862 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002863 PyErr_Format(PyExc_TypeError,
2864 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002865 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002867}
2868
Alexander Belopolsky40018472011-02-26 01:02:56 +00002869PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002870PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002871 const char *encoding,
2872 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002873{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002875 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002876
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 PyErr_BadInternalCall();
2879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002881
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002882 /* Decoding bytes objects is the most common case and should be fast */
2883 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002884 if (PyBytes_GET_SIZE(obj) == 0)
2885 _Py_RETURN_UNICODE_EMPTY();
2886 v = PyUnicode_Decode(
2887 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2888 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 return v;
2890 }
2891
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyErr_SetString(PyExc_TypeError,
2894 "decoding str is not supported");
2895 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002898 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2899 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2900 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002901 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002902 Py_TYPE(obj)->tp_name);
2903 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002904 }
Tim Petersced69f82003-09-16 20:30:58 +00002905
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002906 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002907 PyBuffer_Release(&buffer);
2908 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002910
Serhiy Storchaka05997252013-01-26 12:14:02 +02002911 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002912 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914}
2915
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The result is written to lower, a buffer of lower_len
   bytes, and is always null-terminated on success.

   A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 on error (lower_len is 0, or the normalized name
   including its terminating null byte does not fit in lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* An empty buffer cannot even hold the terminating null byte.  Without
       this check, lower_len - 1 below wraps around and the loop writes
       outside the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    for (e = encoding; *e != '\0'; e++) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e);
        }
        else if (*e == '_') {
            *l++ = '-';
        }
        else {
            *l++ = *e;
        }
    }
    *l = '\0';
    return 1;
}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 Py_ssize_t size,
2959 const char *encoding,
2960 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002961{
2962 PyObject *buffer = NULL, *unicode;
2963 Py_buffer info;
2964 char lower[11]; /* Enough for any encoding shortcut */
2965
Fred Drakee4315f52000-05-09 19:53:39 +00002966 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002967 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002968 if ((strcmp(lower, "utf-8") == 0) ||
2969 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002970 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002971 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002972 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002973 (strcmp(lower, "iso-8859-1") == 0) ||
2974 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002975 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002976#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002977 else if (strcmp(lower, "mbcs") == 0)
2978 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002979#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if (strcmp(lower, "ascii") == 0)
2981 return PyUnicode_DecodeASCII(s, size, errors);
2982 else if (strcmp(lower, "utf-16") == 0)
2983 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2984 else if (strcmp(lower, "utf-32") == 0)
2985 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987
2988 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002989 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002990 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002991 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002992 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 if (buffer == NULL)
2994 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002995 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (unicode == NULL)
2997 goto onError;
2998 if (!PyUnicode_Check(unicode)) {
2999 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003000 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3001 "use codecs.decode() to decode to arbitrary types",
3002 encoding,
3003 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 Py_DECREF(unicode);
3005 goto onError;
3006 }
3007 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_XDECREF(buffer);
3012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003034 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 const char *encoding,
3043 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044{
3045 PyObject *v;
3046
3047 if (!PyUnicode_Check(unicode)) {
3048 PyErr_BadArgument();
3049 goto onError;
3050 }
3051
3052 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003054
3055 /* Decode via the codec registry */
3056 v = PyCodec_Decode(unicode, encoding, errors);
3057 if (v == NULL)
3058 goto onError;
3059 if (!PyUnicode_Check(v)) {
3060 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003061 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3062 "use codecs.decode() to decode to arbitrary types",
3063 encoding,
3064 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003065 Py_DECREF(v);
3066 goto onError;
3067 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003068 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003071 return NULL;
3072}
3073
Alexander Belopolsky40018472011-02-26 01:02:56 +00003074PyObject *
3075PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003076 Py_ssize_t size,
3077 const char *encoding,
3078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079{
3080 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 unicode = PyUnicode_FromUnicode(s, size);
3083 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3086 Py_DECREF(unicode);
3087 return v;
3088}
3089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090PyObject *
3091PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003092 const char *encoding,
3093 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094{
3095 PyObject *v;
3096
3097 if (!PyUnicode_Check(unicode)) {
3098 PyErr_BadArgument();
3099 goto onError;
3100 }
3101
3102 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003104
3105 /* Encode via the codec registry */
3106 v = PyCodec_Encode(unicode, encoding, errors);
3107 if (v == NULL)
3108 goto onError;
3109 return v;
3110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003112 return NULL;
3113}
3114
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index, in wchar_t units,
   of the first failing character, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot is the terminator and is never overwritten. */
    unit[sizeof(unit) / sizeof(unit[0]) - 1] = 0;

    while (*cursor != L'\0') {
        const wchar_t *chunk = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return chunk - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3161
Victor Stinner1b579672011-12-17 05:47:23 +01003162static int
3163locale_error_handler(const char *errors, int *surrogateescape)
3164{
3165 if (errors == NULL) {
3166 *surrogateescape = 0;
3167 return 0;
3168 }
3169
3170 if (strcmp(errors, "strict") == 0) {
3171 *surrogateescape = 0;
3172 return 0;
3173 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003174 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003175 *surrogateescape = 1;
3176 return 0;
3177 }
3178 PyErr_Format(PyExc_ValueError,
3179 "only 'strict' and 'surrogateescape' error handlers "
3180 "are supported, not '%s'",
3181 errors);
3182 return -1;
3183}
3184
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003185PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003186PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003187{
3188 Py_ssize_t wlen, wlen2;
3189 wchar_t *wstr;
3190 PyObject *bytes = NULL;
3191 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003192 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003193 PyObject *exc;
3194 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003195 int surrogateescape;
3196
3197 if (locale_error_handler(errors, &surrogateescape) < 0)
3198 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003199
3200 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3201 if (wstr == NULL)
3202 return NULL;
3203
3204 wlen2 = wcslen(wstr);
3205 if (wlen2 != wlen) {
3206 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003207 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208 return NULL;
3209 }
3210
3211 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003212 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213 char *str;
3214
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003215 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 if (str == NULL) {
3217 if (error_pos == (size_t)-1) {
3218 PyErr_NoMemory();
3219 PyMem_Free(wstr);
3220 return NULL;
3221 }
3222 else {
3223 goto encode_error;
3224 }
3225 }
3226 PyMem_Free(wstr);
3227
3228 bytes = PyBytes_FromString(str);
3229 PyMem_Free(str);
3230 }
3231 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003232 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233 size_t len, len2;
3234
3235 len = wcstombs(NULL, wstr, 0);
3236 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003237 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 goto encode_error;
3239 }
3240
3241 bytes = PyBytes_FromStringAndSize(NULL, len);
3242 if (bytes == NULL) {
3243 PyMem_Free(wstr);
3244 return NULL;
3245 }
3246
3247 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3248 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003249 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003250 goto encode_error;
3251 }
3252 PyMem_Free(wstr);
3253 }
3254 return bytes;
3255
3256encode_error:
3257 errmsg = strerror(errno);
3258 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003259
3260 if (error_pos == (size_t)-1)
3261 error_pos = wcstombs_errorpos(wstr);
3262
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 PyMem_Free(wstr);
3264 Py_XDECREF(bytes);
3265
Victor Stinner2f197072011-12-17 07:08:30 +01003266 if (errmsg != NULL) {
3267 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003268 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003269 if (wstr != NULL) {
3270 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003271 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003272 } else
3273 errmsg = NULL;
3274 }
3275 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003276 reason = PyUnicode_FromString(
3277 "wcstombs() encountered an unencodable "
3278 "wide character");
3279 if (reason == NULL)
3280 return NULL;
3281
3282 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3283 "locale", unicode,
3284 (Py_ssize_t)error_pos,
3285 (Py_ssize_t)(error_pos+1),
3286 reason);
3287 Py_DECREF(reason);
3288 if (exc != NULL) {
3289 PyCodec_StrictErrors(exc);
3290 Py_XDECREF(exc);
3291 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003292 return NULL;
3293}
3294
Victor Stinnerad158722010-10-27 00:25:46 +00003295PyObject *
3296PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003297{
Victor Stinner99b95382011-07-04 14:23:54 +02003298#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003299 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003300#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003302#else
Victor Stinner793b5312011-04-27 00:24:21 +02003303 PyInterpreterState *interp = PyThreadState_GET()->interp;
3304 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3305 cannot use it to encode and decode filenames before it is loaded. Load
3306 the Python codec requires to encode at least its own filename. Use the C
3307 version of the locale codec until the codec registry is initialized and
3308 the Python codec is loaded.
3309
3310 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3311 cannot only rely on it: check also interp->fscodec_initialized for
3312 subinterpreters. */
3313 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003314 return PyUnicode_AsEncodedString(unicode,
3315 Py_FileSystemDefaultEncoding,
3316 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003317 }
3318 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003319 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003320 }
Victor Stinnerad158722010-10-27 00:25:46 +00003321#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003322}
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
3325PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 const char *encoding,
3327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328{
3329 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003330 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (!PyUnicode_Check(unicode)) {
3333 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
Fred Drakee4315f52000-05-09 19:53:39 +00003336
Fred Drakee4315f52000-05-09 19:53:39 +00003337 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003338 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003339 if ((strcmp(lower, "utf-8") == 0) ||
3340 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003341 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003342 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003344 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003346 }
Victor Stinner37296e82010-06-10 13:36:23 +00003347 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003348 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003349 (strcmp(lower, "iso-8859-1") == 0) ||
3350 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003352#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003353 else if (strcmp(lower, "mbcs") == 0)
3354 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003355#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003356 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003361 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003363 return NULL;
3364
3365 /* The normal path */
3366 if (PyBytes_Check(v))
3367 return v;
3368
3369 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003371 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003372 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003373
3374 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003375 "encoder %s returned bytearray instead of bytes; "
3376 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003377 encoding);
3378 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003379 Py_DECREF(v);
3380 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003382
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003383 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3384 Py_DECREF(v);
3385 return b;
3386 }
3387
3388 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003389 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3390 "use codecs.encode() to encode to arbitrary types",
3391 encoding,
3392 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003393 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003394 return NULL;
3395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
3409 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411
3412 /* Encode via the codec registry */
3413 v = PyCodec_Encode(unicode, encoding, errors);
3414 if (v == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(v)) {
3417 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003418 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3419 "use codecs.encode() to encode to arbitrary types",
3420 encoding,
3421 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 Py_DECREF(v);
3423 goto onError;
3424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003426
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 return NULL;
3429}
3430
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert.

   Return the byte offset of the first invalid or incomplete sequence.
   Return 0 when no such sequence is found, and always when mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3461
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003462PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003463PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003464 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003465{
3466 wchar_t smallbuf[256];
3467 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3468 wchar_t *wstr;
3469 size_t wlen, wlen2;
3470 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003471 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003472 size_t error_pos;
3473 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003474 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3475 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003476
3477 if (locale_error_handler(errors, &surrogateescape) < 0)
3478 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003479
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003480 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3481 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482 return NULL;
3483 }
3484
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003485 if (surrogateescape) {
3486 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003487 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 if (wstr == NULL) {
3489 if (wlen == (size_t)-1)
3490 PyErr_NoMemory();
3491 else
3492 PyErr_SetFromErrno(PyExc_OSError);
3493 return NULL;
3494 }
3495
3496 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003497 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003498 }
3499 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501#ifndef HAVE_BROKEN_MBSTOWCS
3502 wlen = mbstowcs(NULL, str, 0);
3503#else
3504 wlen = len;
3505#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003506 if (wlen == (size_t)-1)
3507 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508 if (wlen+1 <= smallbuf_len) {
3509 wstr = smallbuf;
3510 }
3511 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003512 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003513 if (!wstr)
3514 return PyErr_NoMemory();
3515 }
3516
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003517 wlen2 = mbstowcs(wstr, str, wlen+1);
3518 if (wlen2 == (size_t)-1) {
3519 if (wstr != smallbuf)
3520 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003521 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 }
3523#ifdef HAVE_BROKEN_MBSTOWCS
3524 assert(wlen2 == wlen);
3525#endif
3526 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3527 if (wstr != smallbuf)
3528 PyMem_Free(wstr);
3529 }
3530 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003531
3532decode_error:
3533 errmsg = strerror(errno);
3534 assert(errmsg != NULL);
3535
3536 error_pos = mbstowcs_errorpos(str, len);
3537 if (errmsg != NULL) {
3538 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003539 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003540 if (wstr != NULL) {
3541 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003542 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003543 } else
3544 errmsg = NULL;
3545 }
3546 if (errmsg == NULL)
3547 reason = PyUnicode_FromString(
3548 "mbstowcs() encountered an invalid multibyte sequence");
3549 if (reason == NULL)
3550 return NULL;
3551
3552 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3553 "locale", str, len,
3554 (Py_ssize_t)error_pos,
3555 (Py_ssize_t)(error_pos+1),
3556 reason);
3557 Py_DECREF(reason);
3558 if (exc != NULL) {
3559 PyCodec_StrictErrors(exc);
3560 Py_XDECREF(exc);
3561 }
3562 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003563}
3564
3565PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003566PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003567{
3568 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003569 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003570}
3571
3572
3573PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003574PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003576 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3577}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578
Christian Heimes5894ba72007-11-04 11:43:14 +00003579PyObject*
3580PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3581{
Victor Stinner99b95382011-07-04 14:23:54 +02003582#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003583 return PyUnicode_DecodeMBCS(s, size, NULL);
3584#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003585 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003586#else
Victor Stinner793b5312011-04-27 00:24:21 +02003587 PyInterpreterState *interp = PyThreadState_GET()->interp;
3588 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3589 cannot use it to encode and decode filenames before it is loaded. Load
3590 the Python codec requires to encode at least its own filename. Use the C
3591 version of the locale codec until the codec registry is initialized and
3592 the Python codec is loaded.
3593
3594 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3595 cannot only rely on it: check also interp->fscodec_initialized for
3596 subinterpreters. */
3597 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003598 return PyUnicode_Decode(s, size,
3599 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003600 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601 }
3602 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003603 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003604 }
Victor Stinnerad158722010-10-27 00:25:46 +00003605#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003606}
3607
Martin v. Löwis011e8422009-05-05 04:43:17 +00003608
3609int
3610PyUnicode_FSConverter(PyObject* arg, void* addr)
3611{
3612 PyObject *output = NULL;
3613 Py_ssize_t size;
3614 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003615 if (arg == NULL) {
3616 Py_DECREF(*(PyObject**)addr);
3617 return 1;
3618 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003619 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 output = arg;
3621 Py_INCREF(output);
3622 }
3623 else {
3624 arg = PyUnicode_FromObject(arg);
3625 if (!arg)
3626 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003627 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003628 Py_DECREF(arg);
3629 if (!output)
3630 return 0;
3631 if (!PyBytes_Check(output)) {
3632 Py_DECREF(output);
3633 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3634 return 0;
3635 }
3636 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003637 size = PyBytes_GET_SIZE(output);
3638 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003639 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003640 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003641 Py_DECREF(output);
3642 return 0;
3643 }
3644 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003645 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003646}
3647
3648
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003649int
3650PyUnicode_FSDecoder(PyObject* arg, void* addr)
3651{
3652 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003653 if (arg == NULL) {
3654 Py_DECREF(*(PyObject**)addr);
3655 return 1;
3656 }
3657 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003658 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003659 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003660 output = arg;
3661 Py_INCREF(output);
3662 }
3663 else {
3664 arg = PyBytes_FromObject(arg);
3665 if (!arg)
3666 return 0;
3667 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3668 PyBytes_GET_SIZE(arg));
3669 Py_DECREF(arg);
3670 if (!output)
3671 return 0;
3672 if (!PyUnicode_Check(output)) {
3673 Py_DECREF(output);
3674 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3675 return 0;
3676 }
3677 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003678 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003679 Py_DECREF(output);
3680 return 0;
3681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003683 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003684 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003685 Py_DECREF(output);
3686 return 0;
3687 }
3688 *(PyObject**)addr = output;
3689 return Py_CLEANUP_SUPPORTED;
3690}
3691
3692
Martin v. Löwis5b222132007-06-10 09:51:05 +00003693char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003694PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003695{
Christian Heimesf3863112007-11-22 07:46:41 +00003696 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003697
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003703 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003705 if (PyUnicode_UTF8(unicode) == NULL) {
3706 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3708 if (bytes == NULL)
3709 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003710 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3711 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003712 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 Py_DECREF(bytes);
3714 return NULL;
3715 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3717 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3718 PyBytes_AS_STRING(bytes),
3719 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 Py_DECREF(bytes);
3721 }
3722
3723 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003724 *psize = PyUnicode_UTF8_LENGTH(unicode);
3725 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003726}
3727
3728char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3732}
3733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734Py_UNICODE *
3735PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 const unsigned char *one_byte;
3738#if SIZEOF_WCHAR_T == 4
3739 const Py_UCS2 *two_bytes;
3740#else
3741 const Py_UCS4 *four_bytes;
3742 const Py_UCS4 *ucs4_end;
3743 Py_ssize_t num_surrogates;
3744#endif
3745 wchar_t *w;
3746 wchar_t *wchar_end;
3747
3748 if (!PyUnicode_Check(unicode)) {
3749 PyErr_BadArgument();
3750 return NULL;
3751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 assert(_PyUnicode_KIND(unicode) != 0);
3755 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3760 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 num_surrogates = 0;
3762
3763 for (; four_bytes < ucs4_end; ++four_bytes) {
3764 if (*four_bytes > 0xFFFF)
3765 ++num_surrogates;
3766 }
3767
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3769 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3770 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 PyErr_NoMemory();
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 w = _PyUnicode_WSTR(unicode);
3777 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3778 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3780 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003781 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003783 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3784 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 }
3786 else
3787 *w = *four_bytes;
3788
3789 if (w > wchar_end) {
3790 assert(0 && "Miscalculated string end");
3791 }
3792 }
3793 *w = 0;
3794#else
3795 /* sizeof(wchar_t) == 4 */
3796 Py_FatalError("Impossible unicode object state, wstr and str "
3797 "should share memory already.");
3798 return NULL;
3799#endif
3800 }
3801 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003802 if ((size_t)_PyUnicode_LENGTH(unicode) >
3803 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3804 PyErr_NoMemory();
3805 return NULL;
3806 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003807 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3808 (_PyUnicode_LENGTH(unicode) + 1));
3809 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 PyErr_NoMemory();
3811 return NULL;
3812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3814 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3815 w = _PyUnicode_WSTR(unicode);
3816 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3819 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 for (; w < wchar_end; ++one_byte, ++w)
3821 *w = *one_byte;
3822 /* null-terminate the wstr */
3823 *w = 0;
3824 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003825 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 for (; w < wchar_end; ++two_bytes, ++w)
3829 *w = *two_bytes;
3830 /* null-terminate the wstr */
3831 *w = 0;
3832#else
3833 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 PyObject_FREE(_PyUnicode_WSTR(unicode));
3835 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 Py_FatalError("Impossible unicode object state, wstr "
3837 "and str should share memory already.");
3838 return NULL;
3839#endif
3840 }
3841 else {
3842 assert(0 && "This should never happen.");
3843 }
3844 }
3845 }
3846 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003847 *size = PyUnicode_WSTR_LENGTH(unicode);
3848 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003849}
3850
Alexander Belopolsky40018472011-02-26 01:02:56 +00003851Py_UNICODE *
3852PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857
Alexander Belopolsky40018472011-02-26 01:02:56 +00003858Py_ssize_t
3859PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860{
3861 if (!PyUnicode_Check(unicode)) {
3862 PyErr_BadArgument();
3863 goto onError;
3864 }
3865 return PyUnicode_GET_SIZE(unicode);
3866
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 return -1;
3869}
3870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871Py_ssize_t
3872PyUnicode_GetLength(PyObject *unicode)
3873{
Victor Stinner07621332012-06-16 04:53:46 +02003874 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 PyErr_BadArgument();
3876 return -1;
3877 }
Victor Stinner07621332012-06-16 04:53:46 +02003878 if (PyUnicode_READY(unicode) == -1)
3879 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 return PyUnicode_GET_LENGTH(unicode);
3881}
3882
3883Py_UCS4
3884PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3885{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003886 void *data;
3887 int kind;
3888
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003889 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3890 PyErr_BadArgument();
3891 return (Py_UCS4)-1;
3892 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003893 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003894 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 return (Py_UCS4)-1;
3896 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003897 data = PyUnicode_DATA(unicode);
3898 kind = PyUnicode_KIND(unicode);
3899 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900}
3901
3902int
3903PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3904{
3905 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003906 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return -1;
3908 }
Victor Stinner488fa492011-12-12 00:01:39 +01003909 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003910 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003911 PyErr_SetString(PyExc_IndexError, "string index out of range");
3912 return -1;
3913 }
Victor Stinner488fa492011-12-12 00:01:39 +01003914 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003915 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003916 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3917 PyErr_SetString(PyExc_ValueError, "character out of range");
3918 return -1;
3919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3921 index, ch);
3922 return 0;
3923}
3924
/* Return the name of the default encoding, the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3930
Victor Stinner554f3f02010-06-16 23:33:54 +00003931/* create or adjust a UnicodeDecodeError */
3932static void
3933make_decode_exception(PyObject **exceptionObject,
3934 const char *encoding,
3935 const char *input, Py_ssize_t length,
3936 Py_ssize_t startpos, Py_ssize_t endpos,
3937 const char *reason)
3938{
3939 if (*exceptionObject == NULL) {
3940 *exceptionObject = PyUnicodeDecodeError_Create(
3941 encoding, input, length, startpos, endpos, reason);
3942 }
3943 else {
3944 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3945 goto onError;
3946 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3947 goto onError;
3948 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3949 goto onError;
3950 }
3951 return;
3952
3953onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003954 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003955}
3956
#ifdef HAVE_MBCS
/* Error handling callback helper (variant writing to a wchar_t output):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: name of the error handler and a cache for the
       handler object; the handler is looked up with PyCodec_LookupError()
       when *errorHandler is NULL.
   encoding, reason, startinpos, endinpos: used to create or update the
       UnicodeDecodeError stored in *exceptionObject.
   input, inend, inptr: input buffer bounds and read position; they are
       reloaded from the exception's object attribute after the handler
       ran, and *inptr is set to the position returned by the handler.
   output, outpos: output string (wchar_t kind) and write position; the
       string is resized when the replacement plus the rest of the input
       does not fit.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "O!n;" is the PyArg_ParseTuple format; the text after the ';'
       (&argparse[4]) doubles as the TypeError message. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not read a non-bytes object through the PyBytes_* macros:
           drop our reference and report the error. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() would stop at the
       first null character of the replacement and zero-fill the rest,
       corrupting replacements that contain U+0000. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4072
4073static int
4074unicode_decode_call_errorhandler_writer(
4075 const char *errors, PyObject **errorHandler,
4076 const char *encoding, const char *reason,
4077 const char **input, const char **inend, Py_ssize_t *startinpos,
4078 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4079 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4080{
4081 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4082
4083 PyObject *restuple = NULL;
4084 PyObject *repunicode = NULL;
4085 Py_ssize_t insize;
4086 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004087 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004088 PyObject *inputobj = NULL;
4089
4090 if (*errorHandler == NULL) {
4091 *errorHandler = PyCodec_LookupError(errors);
4092 if (*errorHandler == NULL)
4093 goto onError;
4094 }
4095
4096 make_decode_exception(exceptionObject,
4097 encoding,
4098 *input, *inend - *input,
4099 *startinpos, *endinpos,
4100 reason);
4101 if (*exceptionObject == NULL)
4102 goto onError;
4103
4104 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4105 if (restuple == NULL)
4106 goto onError;
4107 if (!PyTuple_Check(restuple)) {
4108 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4109 goto onError;
4110 }
4111 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004112 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004113
4114 /* Copy back the bytes variables, which might have been modified by the
4115 callback */
4116 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4117 if (!inputobj)
4118 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004119 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004121 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004122 *input = PyBytes_AS_STRING(inputobj);
4123 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004124 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004125 /* we can DECREF safely, as the exception has another reference,
4126 so the object won't go away. */
4127 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004131 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004132 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135
Victor Stinner8f674cc2013-04-17 23:02:17 +02004136 if (PyUnicode_READY(repunicode) < 0)
4137 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004138 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004139 if (replen > 1) {
4140 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004141 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004142 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4143 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4144 goto onError;
4145 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004146 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004147 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004153 Py_XDECREF(restuple);
4154 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004158 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159}
4160
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004161/* --- UTF-7 Codec -------------------------------------------------------- */
4162
Antoine Pitrou244651a2009-05-04 18:56:13 +00004163/* See RFC2152 for details. We encode conservatively and decode liberally. */
4164
4165/* Three simple macros defining base-64. */
4166
4167/* Is c a base-64 character? */
4168
4169#define IS_BASE64(c) \
4170 (((c) >= 'A' && (c) <= 'Z') || \
4171 ((c) >= 'a' && (c) <= 'z') || \
4172 ((c) >= '0' && (c) <= '9') || \
4173 (c) == '+' || (c) == '/')
4174
4175/* given that c is a base-64 character, what is its base-64 value? */
4176
4177#define FROM_BASE64(c) \
4178 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4179 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4180 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4181 (c) == '+' ? 62 : 63)
4182
4183/* What is the base-64 character of the bottom 6 bits of n? */
4184
4185#define TO_BASE64(n) \
4186 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4187
4188/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4189 * decoded as itself. We are permissive on decoding; the only ASCII
4190 * byte not decoding to itself is the + which begins a base64
4191 * string. */
4192
4193#define DECODE_DIRECT(c) \
4194 ((c) <= 127 && (c) != '+')
4195
4196/* The UTF-7 encoder treats ASCII characters differently according to
4197 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4198 * the above). See RFC2152. This array identifies these different
4199 * sets:
4200 * 0 : "Set D"
4201 * alphanumeric and '(),-./:?
4202 * 1 : "Set O"
4203 * !"#$%&*;<=>@[]^_`{|}
4204 * 2 : "whitespace"
4205 * ht nl cr sp
4206 * 3 : special (must be base64 encoded)
4207 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4208 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004209
Tim Petersced69f82003-09-16 20:30:58 +00004210static
Antoine Pitrou244651a2009-05-04 18:56:13 +00004211char utf7_category[128] = {
4212/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
4213 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
4214/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
4215 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4216/* sp ! " # $ % & ' ( ) * + , - . / */
4217 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
4218/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
4219 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
4220/* @ A B C D E F G H I J K L M N O */
4221 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4222/* P Q R S T U V W X Y Z [ \ ] ^ _ */
4223 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
4224/* ` a b c d e f g h i j k l m n o */
4225 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4226/* p q r s t u v w x y z { | } ~ del */
4227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004228};
4229
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - code point to classify; NUL and anything >= 128 is never direct
 * directO  - non-zero if "Set O" characters (category 1) may be direct
 * directWS - non-zero if whitespace (category 2) may be direct
 *
 * Both flag arguments are parenthesized in the expansion so that any
 * expression (e.g. `a || b`) can be passed safely.  `c` is evaluated
 * several times and must therefore be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241
Alexander Belopolsky40018472011-02-26 01:02:56 +00004242PyObject *
4243PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004244 Py_ssize_t size,
4245 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004247 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4248}
4249
Antoine Pitrou244651a2009-05-04 18:56:13 +00004250/* The decoder. The only state we preserve is our read position,
4251 * i.e. how many characters we have consumed. So if we end in the
4252 * middle of a shift sequence we have to back off the read position
4253 * and the output to the beginning of the sequence, otherwise we lose
4254 * all the shift state (seen bits, number of bits seen, high
4255 * surrogate). */
4256
Alexander Belopolsky40018472011-02-26 01:02:56 +00004257PyObject *
4258PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004259 Py_ssize_t size,
4260 const char *errors,
4261 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004262{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004264 Py_ssize_t startinpos;
4265 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004266 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004267 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 const char *errmsg = "";
4269 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004270 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004271 unsigned int base64bits = 0;
4272 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004273 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 PyObject *errorHandler = NULL;
4275 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004276
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004277 if (size == 0) {
4278 if (consumed)
4279 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004280 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004281 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004283 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004284 _PyUnicodeWriter_Init(&writer);
4285 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286
4287 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288 e = s + size;
4289
4290 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004291 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004293 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 if (inShift) { /* in a base-64 section */
4296 if (IS_BASE64(ch)) { /* consume a base-64 character */
4297 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4298 base64bits += 6;
4299 s++;
4300 if (base64bits >= 16) {
4301 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004302 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 base64bits -= 16;
4304 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004305 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306 if (surrogate) {
4307 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004308 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4309 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004310 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004311 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004312 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004313 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 }
4315 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004316 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004317 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
4320 }
Victor Stinner551ac952011-11-29 22:58:13 +01004321 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 /* first surrogate */
4323 surrogate = outCh;
4324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004326 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 }
4329 }
4330 }
4331 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 inShift = 0;
4333 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004335 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004336 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004337 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (base64bits > 0) { /* left-over bits */
4340 if (base64bits >= 6) {
4341 /* We've seen at least one base-64 character */
4342 errmsg = "partial character in shift sequence";
4343 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 else {
4346 /* Some bits remain; they should be zero */
4347 if (base64buffer != 0) {
4348 errmsg = "non-zero padding bits in shift sequence";
4349 goto utf7Error;
4350 }
4351 }
4352 }
4353 if (ch != '-') {
4354 /* '-' is absorbed; other terminating
4355 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004358 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004359 }
4360 }
4361 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 s++; /* consume '+' */
4364 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004373 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
4375 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004378 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004379 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 else {
4382 startinpos = s-starts;
4383 s++;
4384 errmsg = "unexpected special character";
4385 goto utf7Error;
4386 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004390 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 errors, &errorHandler,
4392 "utf7", errmsg,
4393 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
4397
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 /* end of string */
4399
4400 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4401 /* if we're in an inconsistent state, that's an error */
4402 if (surrogate ||
4403 (base64bits >= 6) ||
4404 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 errors, &errorHandler,
4408 "utf7", "unterminated shift sequence",
4409 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 goto onError;
4412 if (s < e)
4413 goto restart;
4414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416
4417 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004418 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004420 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004421 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004422 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004423 writer.kind, writer.data, shiftOutStart);
4424 Py_XDECREF(errorHandler);
4425 Py_XDECREF(exc);
4426 _PyUnicodeWriter_Dealloc(&writer);
4427 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004428 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004429 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 }
4431 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004432 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004434 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 Py_XDECREF(errorHandler);
4437 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 Py_XDECREF(errorHandler);
4442 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 return NULL;
4445}
4446
4447
Alexander Belopolsky40018472011-02-26 01:02:56 +00004448PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004449_PyUnicode_EncodeUTF7(PyObject *str,
4450 int base64SetO,
4451 int base64WhiteSpace,
4452 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004454 int kind;
4455 void *data;
4456 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004457 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004459 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 unsigned int base64bits = 0;
4461 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 char * out;
4463 char * start;
4464
Benjamin Petersonbac79492012-01-14 13:34:47 -05004465 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004466 return NULL;
4467 kind = PyUnicode_KIND(str);
4468 data = PyUnicode_DATA(str);
4469 len = PyUnicode_GET_LENGTH(str);
4470
4471 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004475 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004476 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004477 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478 if (v == NULL)
4479 return NULL;
4480
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004481 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004482 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004483 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484
Antoine Pitrou244651a2009-05-04 18:56:13 +00004485 if (inShift) {
4486 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4487 /* shifting out */
4488 if (base64bits) { /* output remaining bits */
4489 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4490 base64buffer = 0;
4491 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
4493 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 /* Characters not in the BASE64 set implicitly unshift the sequence
4495 so no '-' is required, except if the character is itself a '-' */
4496 if (IS_BASE64(ch) || ch == '-') {
4497 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 *out++ = (char) ch;
4500 }
4501 else {
4502 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004503 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 else { /* not in a shift sequence */
4506 if (ch == '+') {
4507 *out++ = '+';
4508 *out++ = '-';
4509 }
4510 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4511 *out++ = (char) ch;
4512 }
4513 else {
4514 *out++ = '+';
4515 inShift = 1;
4516 goto encode_char;
4517 }
4518 }
4519 continue;
4520encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004522 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004523
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 /* code first surrogate */
4525 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004526 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 while (base64bits >= 6) {
4528 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4529 base64bits -= 6;
4530 }
4531 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004532 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 base64bits += 16;
4535 base64buffer = (base64buffer << 16) | ch;
4536 while (base64bits >= 6) {
4537 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4538 base64bits -= 6;
4539 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004540 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 if (base64bits)
4542 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4543 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004545 if (_PyBytes_Resize(&v, out - start) < 0)
4546 return NULL;
4547 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004549PyObject *
4550PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4551 Py_ssize_t size,
4552 int base64SetO,
4553 int base64WhiteSpace,
4554 const char *errors)
4555{
4556 PyObject *result;
4557 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4558 if (tmp == NULL)
4559 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004560 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004561 base64WhiteSpace, errors);
4562 Py_DECREF(tmp);
4563 return result;
4564}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566#undef IS_BASE64
4567#undef FROM_BASE64
4568#undef TO_BASE64
4569#undef DECODE_DIRECT
4570#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572/* --- UTF-8 Codec -------------------------------------------------------- */
4573
Alexander Belopolsky40018472011-02-26 01:02:56 +00004574PyObject *
4575PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004576 Py_ssize_t size,
4577 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578{
Walter Dörwald69652032004-09-07 20:24:22 +00004579 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4580}
4581
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004582#include "stringlib/asciilib.h"
4583#include "stringlib/codecs.h"
4584#include "stringlib/undef.h"
4585
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004586#include "stringlib/ucs1lib.h"
4587#include "stringlib/codecs.h"
4588#include "stringlib/undef.h"
4589
4590#include "stringlib/ucs2lib.h"
4591#include "stringlib/codecs.h"
4592#include "stringlib/undef.h"
4593
4594#include "stringlib/ucs4lib.h"
4595#include "stringlib/codecs.h"
4596#include "stringlib/undef.h"
4597
Antoine Pitrouab868312009-01-10 15:40:25 +00004598/* Mask to quickly check whether a C 'long' contains a
4599 non-ASCII, UTF8-encoded char. */
4600#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004601# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004602#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004603# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004604#else
4605# error C 'long' size should be either 4 or 8!
4606#endif
4607
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004608static Py_ssize_t
4609ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004610{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004611 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004612 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004614 /*
4615 * Issue #17237: m68k is a bit different from most architectures in
4616 * that objects do not use "natural alignment" - for example, int and
4617 * long are only aligned at 2-byte boundaries. Therefore the assert()
4618 * won't work; also, tests have shown that skipping the "optimised
4619 * version" will even speed up m68k.
4620 */
4621#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004623 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4624 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004625 /* Fast path, see in STRINGLIB(utf8_decode) for
4626 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004627 /* Help allocation */
4628 const char *_p = p;
4629 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004630 while (_p < aligned_end) {
4631 unsigned long value = *(const unsigned long *) _p;
4632 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004634 *((unsigned long *)q) = value;
4635 _p += SIZEOF_LONG;
4636 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004637 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004638 p = _p;
4639 while (p < end) {
4640 if ((unsigned char)*p & 0x80)
4641 break;
4642 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004646#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004647#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 while (p < end) {
4649 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4650 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004651 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004652 /* Help allocation */
4653 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 while (_p < aligned_end) {
4655 unsigned long value = *(unsigned long *) _p;
4656 if (value & ASCII_CHAR_MASK)
4657 break;
4658 _p += SIZEOF_LONG;
4659 }
4660 p = _p;
4661 if (_p == end)
4662 break;
4663 }
4664 if ((unsigned char)*p & 0x80)
4665 break;
4666 ++p;
4667 }
4668 memcpy(dest, start, p - start);
4669 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670}
Antoine Pitrouab868312009-01-10 15:40:25 +00004671
Victor Stinner785938e2011-12-11 20:09:03 +01004672PyObject *
4673PyUnicode_DecodeUTF8Stateful(const char *s,
4674 Py_ssize_t size,
4675 const char *errors,
4676 Py_ssize_t *consumed)
4677{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004679 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681
4682 Py_ssize_t startinpos;
4683 Py_ssize_t endinpos;
4684 const char *errmsg = "";
4685 PyObject *errorHandler = NULL;
4686 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004687
4688 if (size == 0) {
4689 if (consumed)
4690 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004691 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004692 }
4693
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4695 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004696 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697 *consumed = 1;
4698 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004699 }
4700
Victor Stinner8f674cc2013-04-17 23:02:17 +02004701 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004702 writer.min_length = size;
4703 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004704 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004705
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 writer.pos = ascii_decode(s, end, writer.data);
4707 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 while (s < end) {
4709 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 if (PyUnicode_IS_ASCII(writer.buffer))
4713 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 } else {
4719 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 }
4722
4723 switch (ch) {
4724 case 0:
4725 if (s == end || consumed)
4726 goto End;
4727 errmsg = "unexpected end of data";
4728 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004729 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 break;
4731 case 1:
4732 errmsg = "invalid start byte";
4733 startinpos = s - starts;
4734 endinpos = startinpos + 1;
4735 break;
4736 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004737 case 3:
4738 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 errmsg = "invalid continuation byte";
4740 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004741 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 break;
4743 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004744 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 goto onError;
4746 continue;
4747 }
4748
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 errors, &errorHandler,
4751 "utf-8", errmsg,
4752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004753 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004755 }
4756
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 if (consumed)
4759 *consumed = s - starts;
4760
4761 Py_XDECREF(errorHandler);
4762 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004763 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764
4765onError:
4766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004768 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004770}
4771
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    - UTF-8 encoded input (need not be NUL terminated)
   size - number of input bytes

   Undecodable bytes are mapped to the lone surrogates U+DC80..U+DCFF
   (0xDC00 + byte value).

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: the result never needs more wide characters than the input
       has bytes, so size + 1 (for the terminator) is always enough. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character instead of storing it. */
#if SIZEOF_WCHAR_T == 4
            /* NOTE(review): presumably unreachable, since a 4-byte
               buffer can hold every code point. */
            assert(0);
#else
            /* Only a supplementary code point can be split into a
               surrogate pair; it is not itself a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a decoder status code here */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828/* Primary internal function which creates utf8 encoded bytes objects.
4829
4830 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004831 and allocate exactly as much space needed at the end. Else allocate the
4832 maximum possible needed (4 result bytes per Unicode character), and return
4833 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004834*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004835PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004836_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837{
Victor Stinner6099a032011-12-18 14:22:26 +01004838 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839 void *data;
4840 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842 if (!PyUnicode_Check(unicode)) {
4843 PyErr_BadArgument();
4844 return NULL;
4845 }
4846
4847 if (PyUnicode_READY(unicode) == -1)
4848 return NULL;
4849
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004850 if (PyUnicode_UTF8(unicode))
4851 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4852 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004853
4854 kind = PyUnicode_KIND(unicode);
4855 data = PyUnicode_DATA(unicode);
4856 size = PyUnicode_GET_LENGTH(unicode);
4857
Benjamin Petersonead6b532011-12-20 17:23:42 -06004858 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004859 default:
4860 assert(0);
4861 case PyUnicode_1BYTE_KIND:
4862 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4863 assert(!PyUnicode_IS_ASCII(unicode));
4864 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4865 case PyUnicode_2BYTE_KIND:
4866 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4867 case PyUnicode_4BYTE_KIND:
4868 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870}
4871
Alexander Belopolsky40018472011-02-26 01:02:56 +00004872PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4874 Py_ssize_t size,
4875 const char *errors)
4876{
4877 PyObject *v, *unicode;
4878
4879 unicode = PyUnicode_FromUnicode(s, size);
4880 if (unicode == NULL)
4881 return NULL;
4882 v = _PyUnicode_AsUTF8String(unicode, errors);
4883 Py_DECREF(unicode);
4884 return v;
4885}
4886
4887PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004888PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891}
4892
/* --- UTF-32 Codec ------------------------------------------------------- */
4894
4895PyObject *
4896PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 Py_ssize_t size,
4898 const char *errors,
4899 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900{
4901 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4902}
4903
4904PyObject *
4905PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 Py_ssize_t size,
4907 const char *errors,
4908 int *byteorder,
4909 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910{
4911 const char *starts = s;
4912 Py_ssize_t startinpos;
4913 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004914 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004915 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004916 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004917 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004918 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919 PyObject *errorHandler = NULL;
4920 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004921
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922 q = (unsigned char *)s;
4923 e = q + size;
4924
4925 if (byteorder)
4926 bo = *byteorder;
4927
4928 /* Check for BOM marks (U+FEFF) in the input and adjust current
4929 byte order setting accordingly. In native mode, the leading BOM
4930 mark is skipped, in all other modes, it is copied to the output
4931 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004932 if (bo == 0 && size >= 4) {
4933 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4934 if (bom == 0x0000FEFF) {
4935 bo = -1;
4936 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004938 else if (bom == 0xFFFE0000) {
4939 bo = 1;
4940 q += 4;
4941 }
4942 if (byteorder)
4943 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944 }
4945
Victor Stinnere64322e2012-10-30 23:12:47 +01004946 if (q == e) {
4947 if (consumed)
4948 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004949 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 }
4951
Victor Stinnere64322e2012-10-30 23:12:47 +01004952#ifdef WORDS_BIGENDIAN
4953 le = bo < 0;
4954#else
4955 le = bo <= 0;
4956#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004958
Victor Stinner8f674cc2013-04-17 23:02:17 +02004959 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004960 writer.min_length = (e - q + 3) / 4;
4961 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004962 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004963
Victor Stinnere64322e2012-10-30 23:12:47 +01004964 while (1) {
4965 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004967
Victor Stinnere64322e2012-10-30 23:12:47 +01004968 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 enum PyUnicode_Kind kind = writer.kind;
4970 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004971 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004973 if (le) {
4974 do {
4975 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4976 if (ch > maxch)
4977 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004978 if (kind != PyUnicode_1BYTE_KIND &&
4979 Py_UNICODE_IS_SURROGATE(ch))
4980 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004982 q += 4;
4983 } while (q <= last);
4984 }
4985 else {
4986 do {
4987 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4988 if (ch > maxch)
4989 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004990 if (kind != PyUnicode_1BYTE_KIND &&
4991 Py_UNICODE_IS_SURROGATE(ch))
4992 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004994 q += 4;
4995 } while (q <= last);
4996 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004998 }
4999
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005000 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005001 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005002 startinpos = ((const char *)q) - starts;
5003 endinpos = startinpos + 4;
5004 }
5005 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005010 startinpos = ((const char *)q) - starts;
5011 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 else {
5014 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005015 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 goto onError;
5017 q += 4;
5018 continue;
5019 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005020 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005021 startinpos = ((const char *)q) - starts;
5022 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005024
5025 /* The remaining input chars are ignored if the callback
5026 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005027 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005029 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005031 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005033 }
5034
Walter Dörwald41980ca2007-08-16 21:55:45 +00005035 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038 Py_XDECREF(errorHandler);
5039 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005040 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005043 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 Py_XDECREF(errorHandler);
5045 Py_XDECREF(exc);
5046 return NULL;
5047}
5048
5049PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005050_PyUnicode_EncodeUTF32(PyObject *str,
5051 const char *errors,
5052 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005054 int kind;
5055 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005056 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005057 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005058 unsigned char *p;
5059 Py_ssize_t nsize, i;
5060 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005061#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005062 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005064 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005066 const char *encoding;
5067 PyObject *errorHandler = NULL;
5068 PyObject *exc = NULL;
5069 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070
Serhiy Storchaka30793282014-01-04 22:44:01 +02005071#define STORECHAR(CH) \
5072 do { \
5073 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5074 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5075 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5076 p[iorder[0]] = (CH) & 0xff; \
5077 p += 4; \
5078 } while(0)
5079
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005080 if (!PyUnicode_Check(str)) {
5081 PyErr_BadArgument();
5082 return NULL;
5083 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005084 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005085 return NULL;
5086 kind = PyUnicode_KIND(str);
5087 data = PyUnicode_DATA(str);
5088 len = PyUnicode_GET_LENGTH(str);
5089
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005090 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005091 if (nsize > PY_SSIZE_T_MAX / 4)
5092 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005093 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 if (v == NULL)
5095 return NULL;
5096
Serhiy Storchaka30793282014-01-04 22:44:01 +02005097 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005099 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005101 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102
Serhiy Storchaka30793282014-01-04 22:44:01 +02005103 if (byteorder == -1) {
5104 /* force LE */
5105 iorder[0] = 0;
5106 iorder[1] = 1;
5107 iorder[2] = 2;
5108 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005109 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005110 }
5111 else if (byteorder == 1) {
5112 /* force BE */
5113 iorder[0] = 3;
5114 iorder[1] = 2;
5115 iorder[2] = 1;
5116 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005117 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005118 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005119 else
5120 encoding = "utf-32";
5121
5122 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005123 for (i = 0; i < len; i++)
5124 STORECHAR(PyUnicode_READ(kind, data, i));
5125 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 }
5127
Serhiy Storchaka30793282014-01-04 22:44:01 +02005128 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005129 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005130 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5131 i++;
5132 assert(ch <= MAX_UNICODE);
5133 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5134 STORECHAR(ch);
5135 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005136 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005137
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005138 rep = unicode_encode_call_errorhandler(
5139 errors, &errorHandler,
5140 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 str, &exc, i-1, i, &i);
5142
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005143 if (!rep)
5144 goto error;
5145
5146 if (PyBytes_Check(rep)) {
5147 repsize = PyBytes_GET_SIZE(rep);
5148 if (repsize & 3) {
5149 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005150 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005151 "surrogates not allowed");
5152 goto error;
5153 }
5154 moreunits = repsize / 4;
5155 }
5156 else {
5157 assert(PyUnicode_Check(rep));
5158 if (PyUnicode_READY(rep) < 0)
5159 goto error;
5160 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5161 if (!PyUnicode_IS_ASCII(rep)) {
5162 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005163 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005164 "surrogates not allowed");
5165 goto error;
5166 }
5167 }
5168
5169 /* four bytes are reserved for each surrogate */
5170 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005171 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005172 Py_ssize_t morebytes = 4 * (moreunits - 1);
5173 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5174 /* integer overflow */
5175 PyErr_NoMemory();
5176 goto error;
5177 }
5178 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5179 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005180 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005181 }
5182
5183 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005184 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5185 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005186 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005187 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005188 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005189 repdata = PyUnicode_1BYTE_DATA(rep);
5190 while (repsize--) {
5191 Py_UCS4 ch = *repdata++;
5192 STORECHAR(ch);
5193 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005194 }
5195
5196 Py_CLEAR(rep);
5197 }
5198
5199 /* Cut back to size actually needed. This is necessary for, for example,
5200 encoding of a string containing isolated surrogates and the 'ignore'
5201 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005202 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005203 if (nsize != PyBytes_GET_SIZE(v))
5204 _PyBytes_Resize(&v, nsize);
5205 Py_XDECREF(errorHandler);
5206 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005207 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005208 error:
5209 Py_XDECREF(rep);
5210 Py_XDECREF(errorHandler);
5211 Py_XDECREF(exc);
5212 Py_XDECREF(v);
5213 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005214#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215}
5216
Alexander Belopolsky40018472011-02-26 01:02:56 +00005217PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005218PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5219 Py_ssize_t size,
5220 const char *errors,
5221 int byteorder)
5222{
5223 PyObject *result;
5224 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5225 if (tmp == NULL)
5226 return NULL;
5227 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5228 Py_DECREF(tmp);
5229 return result;
5230}
5231
5232PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005233PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234{
Victor Stinnerb960b342011-11-20 19:12:52 +01005235 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236}
5237
/* --- UTF-16 Codec ------------------------------------------------------- */
5239
Tim Peters772747b2001-08-09 22:21:55 +00005240PyObject *
5241PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 Py_ssize_t size,
5243 const char *errors,
5244 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245{
Walter Dörwald69652032004-09-07 20:24:22 +00005246 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5247}
5248
5249PyObject *
5250PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 Py_ssize_t size,
5252 const char *errors,
5253 int *byteorder,
5254 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005255{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005256 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005257 Py_ssize_t startinpos;
5258 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005259 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005261 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005262 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005263 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 PyObject *errorHandler = NULL;
5265 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005266 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
Tim Peters772747b2001-08-09 22:21:55 +00005268 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005269 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270
5271 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005272 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005274 /* Check for BOM marks (U+FEFF) in the input and adjust current
5275 byte order setting accordingly. In native mode, the leading BOM
5276 mark is skipped, in all other modes, it is copied to the output
5277 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005278 if (bo == 0 && size >= 2) {
5279 const Py_UCS4 bom = (q[1] << 8) | q[0];
5280 if (bom == 0xFEFF) {
5281 q += 2;
5282 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005284 else if (bom == 0xFFFE) {
5285 q += 2;
5286 bo = 1;
5287 }
5288 if (byteorder)
5289 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291
Antoine Pitrou63065d72012-05-15 23:48:04 +02005292 if (q == e) {
5293 if (consumed)
5294 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005295 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005296 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297
Christian Heimes743e0cd2012-10-17 23:52:17 +02005298#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005300 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005301#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005303 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005304#endif
Tim Peters772747b2001-08-09 22:21:55 +00005305
Antoine Pitrou63065d72012-05-15 23:48:04 +02005306 /* Note: size will always be longer than the resulting Unicode
5307 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005308 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005309 writer.min_length = (e - q + 1) / 2;
5310 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005312
Antoine Pitrou63065d72012-05-15 23:48:04 +02005313 while (1) {
5314 Py_UCS4 ch = 0;
5315 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005317 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005319 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005321 native_ordering);
5322 else
5323 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005324 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005325 native_ordering);
5326 } else if (kind == PyUnicode_2BYTE_KIND) {
5327 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005328 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005329 native_ordering);
5330 } else {
5331 assert(kind == PyUnicode_4BYTE_KIND);
5332 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005333 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005334 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005335 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337
Antoine Pitrou63065d72012-05-15 23:48:04 +02005338 switch (ch)
5339 {
5340 case 0:
5341 /* remaining byte at the end? (size should be even) */
5342 if (q == e || consumed)
5343 goto End;
5344 errmsg = "truncated data";
5345 startinpos = ((const char *)q) - starts;
5346 endinpos = ((const char *)e) - starts;
5347 break;
5348 /* The remaining input chars are ignored if the callback
5349 chooses to skip the input */
5350 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005351 q -= 2;
5352 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005353 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005354 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005355 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005356 endinpos = ((const char *)e) - starts;
5357 break;
5358 case 2:
5359 errmsg = "illegal encoding";
5360 startinpos = ((const char *)q) - 2 - starts;
5361 endinpos = startinpos + 2;
5362 break;
5363 case 3:
5364 errmsg = "illegal UTF-16 surrogate";
5365 startinpos = ((const char *)q) - 4 - starts;
5366 endinpos = startinpos + 2;
5367 break;
5368 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005369 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005370 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 continue;
5372 }
5373
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 errors,
5376 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005377 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005378 &starts,
5379 (const char **)&e,
5380 &startinpos,
5381 &endinpos,
5382 &exc,
5383 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005384 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 }
5387
Antoine Pitrou63065d72012-05-15 23:48:04 +02005388End:
Walter Dörwald69652032004-09-07 20:24:22 +00005389 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 Py_XDECREF(errorHandler);
5393 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005394 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005397 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 Py_XDECREF(errorHandler);
5399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 return NULL;
5401}
5402
Tim Peters772747b2001-08-09 22:21:55 +00005403PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005404_PyUnicode_EncodeUTF16(PyObject *str,
5405 const char *errors,
5406 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005408 enum PyUnicode_Kind kind;
5409 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005410 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005411 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005412 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005413 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005414#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005415 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005416#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005417 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005418#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005419 const char *encoding;
5420 Py_ssize_t nsize, pos;
5421 PyObject *errorHandler = NULL;
5422 PyObject *exc = NULL;
5423 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005424
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 if (!PyUnicode_Check(str)) {
5426 PyErr_BadArgument();
5427 return NULL;
5428 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005429 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005430 return NULL;
5431 kind = PyUnicode_KIND(str);
5432 data = PyUnicode_DATA(str);
5433 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005434
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005435 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 if (kind == PyUnicode_4BYTE_KIND) {
5437 const Py_UCS4 *in = (const Py_UCS4 *)data;
5438 const Py_UCS4 *end = in + len;
5439 while (in < end)
5440 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005441 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005442 }
5443 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005445 nsize = len + pairs + (byteorder == 0);
5446 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 if (v == NULL)
5448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005450 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005451 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005454 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005455 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005456 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005457
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 if (kind == PyUnicode_1BYTE_KIND) {
5459 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5460 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005461 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005462
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005463 if (byteorder < 0)
5464 encoding = "utf-16-le";
5465 else if (byteorder > 0)
5466 encoding = "utf-16-be";
5467 else
5468 encoding = "utf-16";
5469
5470 pos = 0;
5471 while (pos < len) {
5472 Py_ssize_t repsize, moreunits;
5473
5474 if (kind == PyUnicode_2BYTE_KIND) {
5475 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5476 &out, native_ordering);
5477 }
5478 else {
5479 assert(kind == PyUnicode_4BYTE_KIND);
5480 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5481 &out, native_ordering);
5482 }
5483 if (pos == len)
5484 break;
5485
5486 rep = unicode_encode_call_errorhandler(
5487 errors, &errorHandler,
5488 encoding, "surrogates not allowed",
5489 str, &exc, pos, pos + 1, &pos);
5490 if (!rep)
5491 goto error;
5492
5493 if (PyBytes_Check(rep)) {
5494 repsize = PyBytes_GET_SIZE(rep);
5495 if (repsize & 1) {
5496 raise_encode_exception(&exc, encoding,
5497 str, pos - 1, pos,
5498 "surrogates not allowed");
5499 goto error;
5500 }
5501 moreunits = repsize / 2;
5502 }
5503 else {
5504 assert(PyUnicode_Check(rep));
5505 if (PyUnicode_READY(rep) < 0)
5506 goto error;
5507 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5508 if (!PyUnicode_IS_ASCII(rep)) {
5509 raise_encode_exception(&exc, encoding,
5510 str, pos - 1, pos,
5511 "surrogates not allowed");
5512 goto error;
5513 }
5514 }
5515
5516 /* two bytes are reserved for each surrogate */
5517 if (moreunits > 1) {
5518 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5519 Py_ssize_t morebytes = 2 * (moreunits - 1);
5520 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5521 /* integer overflow */
5522 PyErr_NoMemory();
5523 goto error;
5524 }
5525 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5526 goto error;
5527 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5528 }
5529
5530 if (PyBytes_Check(rep)) {
5531 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5532 out += moreunits;
5533 } else /* rep is unicode */ {
5534 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5535 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5536 &out, native_ordering);
5537 }
5538
5539 Py_CLEAR(rep);
5540 }
5541
5542 /* Cut back to size actually needed. This is necessary for, for example,
5543 encoding of a string containing isolated surrogates and the 'ignore' handler
5544 is used. */
5545 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5546 if (nsize != PyBytes_GET_SIZE(v))
5547 _PyBytes_Resize(&v, nsize);
5548 Py_XDECREF(errorHandler);
5549 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005550 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005551 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005552 error:
5553 Py_XDECREF(rep);
5554 Py_XDECREF(errorHandler);
5555 Py_XDECREF(exc);
5556 Py_XDECREF(v);
5557 return NULL;
5558#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
Alexander Belopolsky40018472011-02-26 01:02:56 +00005561PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005562PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5563 Py_ssize_t size,
5564 const char *errors,
5565 int byteorder)
5566{
5567 PyObject *result;
5568 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5569 if (tmp == NULL)
5570 return NULL;
5571 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5572 Py_DECREF(tmp);
5573 return result;
5574}
5575
5576PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005577PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005579 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580}
5581
/* --- Unicode Escape Codec ----------------------------------------------- */
5583
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether all
   the escapes in the SIZE bytes at S keep the decoded string within the
   ASCII range.

   Returns the length of the buffer required to hold the decoded string,
   or -1 when that cannot be promised up front:
     - SIZE is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input,
     - an octal, x, u, U or N escape is present (these do not guarantee
       ASCII characters).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `p + size`: computing a
       pointer that lies before the start of the buffer is undefined
       behaviour even if it is never dereferenced. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + newline result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5638
Fredrik Lundh06d12682001-01-24 07:59:11 +00005639static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005640
Alexander Belopolsky40018472011-02-26 01:02:56 +00005641PyObject *
5642PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005643 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005644 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005647 Py_ssize_t startinpos;
5648 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005649 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005651 char* message;
5652 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 PyObject *errorHandler = NULL;
5654 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005656
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005657 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005658 if (len == 0)
5659 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660
5661 /* After length_of_escaped_ascii_string() there are two alternatives,
5662 either the string is pure ASCII with named escapes like \n, etc.
5663 and we determined it's exact size (common case)
5664 or it contains \x, \u, ... escape sequences. then we create a
5665 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005666 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005667 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005668 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669 }
5670 else {
5671 /* Escaped strings will always be longer than the resulting
5672 Unicode string, so we start with size here and then reduce the
5673 length after conversion to the true value.
5674 (but if the error callback returns a long replacement string
5675 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005676 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 }
5678
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005680 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 while (s < end) {
5684 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005685 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
5688 /* Non-escape characters are interpreted as Unicode ordinals */
5689 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005690 x = (unsigned char)*s;
5691 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005692 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005693 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 continue;
5695 }
5696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 /* \ - Escapes */
5699 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005700 c = *s++;
5701 if (s > end)
5702 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005704 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707#define WRITECHAR(ch) \
5708 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005709 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005711 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005714 case '\\': WRITECHAR('\\'); break;
5715 case '\'': WRITECHAR('\''); break;
5716 case '\"': WRITECHAR('\"'); break;
5717 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 case 'f': WRITECHAR('\014'); break;
5720 case 't': WRITECHAR('\t'); break;
5721 case 'n': WRITECHAR('\n'); break;
5722 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005725 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005726 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 case '0': case '1': case '2': case '3':
5730 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005731 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005732 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005733 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005734 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005735 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005737 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 break;
5739
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 /* hex escapes */
5741 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005743 digits = 2;
5744 message = "truncated \\xXX escape";
5745 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005749 digits = 4;
5750 message = "truncated \\uXXXX escape";
5751 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005754 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005755 digits = 8;
5756 message = "truncated \\UXXXXXXXX escape";
5757 hexescape:
5758 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005759 if (end - s < digits) {
5760 /* count only hex digits */
5761 for (; s < end; ++s) {
5762 c = (unsigned char)*s;
5763 if (!Py_ISXDIGIT(c))
5764 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005766 goto error;
5767 }
5768 for (; digits--; ++s) {
5769 c = (unsigned char)*s;
5770 if (!Py_ISXDIGIT(c))
5771 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 chr = (chr<<4) & ~0xF;
5773 if (c >= '0' && c <= '9')
5774 chr += c - '0';
5775 else if (c >= 'a' && c <= 'f')
5776 chr += 10 + c - 'a';
5777 else
5778 chr += 10 + c - 'A';
5779 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005780 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 /* _decoding_error will have already written into the
5782 target buffer. */
5783 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005785 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005786 message = "illegal Unicode character";
5787 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005788 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005789 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005790 break;
5791
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 case 'N':
5794 message = "malformed \\N character escape";
5795 if (ucnhash_CAPI == NULL) {
5796 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5798 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005799 if (ucnhash_CAPI == NULL)
5800 goto ucnhashError;
5801 }
5802 if (*s == '{') {
5803 const char *start = s+1;
5804 /* look for the closing brace */
5805 while (*s != '}' && s < end)
5806 s++;
5807 if (s > start && s < end && *s == '}') {
5808 /* found a name. look it up in the unicode database */
5809 message = "unknown Unicode character name";
5810 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005811 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005812 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005813 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005814 goto store;
5815 }
5816 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005817 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005818
5819 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005820 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 message = "\\ at end of string";
5822 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005823 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005824 }
5825 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005826 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005827 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005828 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005831 continue;
5832
5833 error:
5834 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005835 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005836 errors, &errorHandler,
5837 "unicodeescape", message,
5838 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005839 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005840 goto onError;
5841 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005843#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005844
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005847 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005848
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005850 PyErr_SetString(
5851 PyExc_UnicodeError,
5852 "\\N escapes not supported (can't load unicodedata module)"
5853 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005854 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 Py_XDECREF(errorHandler);
5856 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005857 return NULL;
5858
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 return NULL;
5864}
5865
5866/* Return a Unicode-Escape string version of the Unicode object.
5867
5868 If quotes is true, the string is enclosed in u"" or u'' quotes as
5869 appropriate.
5870
5871*/
5872
Alexander Belopolsky40018472011-02-26 01:02:56 +00005873PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005876 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005877 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 int kind;
5880 void *data;
5881 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882
Ezio Melottie7f90372012-10-05 03:33:31 +03005883 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005884 escape.
5885
Ezio Melottie7f90372012-10-05 03:33:31 +03005886 For UCS1 strings it's '\xxx', 4 bytes per source character.
5887 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5888 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005889 */
5890
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 if (!PyUnicode_Check(unicode)) {
5892 PyErr_BadArgument();
5893 return NULL;
5894 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005895 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 return NULL;
5897 len = PyUnicode_GET_LENGTH(unicode);
5898 kind = PyUnicode_KIND(unicode);
5899 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005900 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005901 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5902 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5903 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5904 }
5905
5906 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005907 return PyBytes_FromStringAndSize(NULL, 0);
5908
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005909 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005911
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 if (repr == NULL)
5917 return NULL;
5918
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005921 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005922 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005923
Walter Dörwald79e913e2007-05-12 11:08:06 +00005924 /* Escape backslashes */
5925 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 *p++ = '\\';
5927 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005928 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005929 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005930
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005931 /* Map 21-bit characters to '\U00xxxxxx' */
5932 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005933 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005934 *p++ = '\\';
5935 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005936 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5937 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5938 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5940 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5941 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5942 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5943 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005945 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005946
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005948 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 *p++ = '\\';
5950 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005951 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5952 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5953 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5954 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005956
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005957 /* Map special whitespace to '\t', \n', '\r' */
5958 else if (ch == '\t') {
5959 *p++ = '\\';
5960 *p++ = 't';
5961 }
5962 else if (ch == '\n') {
5963 *p++ = '\\';
5964 *p++ = 'n';
5965 }
5966 else if (ch == '\r') {
5967 *p++ = '\\';
5968 *p++ = 'r';
5969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005971 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005972 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005974 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005975 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5976 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005977 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005978
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 /* Copy everything else as-is */
5980 else
5981 *p++ = (char) ch;
5982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 assert(p - PyBytes_AS_STRING(repr) > 0);
5985 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5986 return NULL;
5987 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988}
5989
Alexander Belopolsky40018472011-02-26 01:02:56 +00005990PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005991PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005994 PyObject *result;
5995 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5996 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005998 result = PyUnicode_AsUnicodeEscapeString(tmp);
5999 Py_DECREF(tmp);
6000 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001}
6002
6003/* --- Raw Unicode Escape Codec ------------------------------------------- */
6004
Alexander Belopolsky40018472011-02-26 01:02:56 +00006005PyObject *
6006PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006007 Py_ssize_t size,
6008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 Py_ssize_t startinpos;
6012 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006013 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 const char *end;
6015 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 PyObject *errorHandler = NULL;
6017 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006018
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006019 if (size == 0)
6020 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 /* Escaped strings will always be longer than the resulting
6023 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 length after conversion to the true value. (But decoding error
6025 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006026 _PyUnicodeWriter_Init(&writer);
6027 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006028
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 end = s + size;
6030 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 unsigned char c;
6032 Py_UCS4 x;
6033 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006034 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 /* Non-escape characters are interpreted as Unicode ordinals */
6037 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006038 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006039 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006042 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 startinpos = s-starts;
6044
6045 /* \u-escapes are only interpreted iff the number of leading
6046 backslashes if odd */
6047 bs = s;
6048 for (;s < end;) {
6049 if (*s != '\\')
6050 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006052 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006053 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 }
6055 if (((s - bs) & 1) == 0 ||
6056 s >= end ||
6057 (*s != 'u' && *s != 'U')) {
6058 continue;
6059 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006060 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 count = *s=='u' ? 4 : 8;
6062 s++;
6063
6064 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 for (x = 0, i = 0; i < count; ++i, ++s) {
6066 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006067 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006069 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 errors, &errorHandler,
6071 "rawunicodeescape", "truncated \\uXXXX",
6072 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 goto onError;
6075 goto nextByte;
6076 }
6077 x = (x<<4) & ~0xF;
6078 if (c >= '0' && c <= '9')
6079 x += c - '0';
6080 else if (c >= 'a' && c <= 'f')
6081 x += 10 + c - 'a';
6082 else
6083 x += 10 + c - 'A';
6084 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006085 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006086 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 }
6089 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006090 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006092 errors, &errorHandler,
6093 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006095 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006097 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 nextByte:
6099 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 Py_XDECREF(errorHandler);
6102 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006103 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006104
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006106 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 Py_XDECREF(errorHandler);
6108 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 return NULL;
6110}
6111
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112
Alexander Belopolsky40018472011-02-26 01:02:56 +00006113PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006116 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 char *p;
6118 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119 Py_ssize_t expandsize, pos;
6120 int kind;
6121 void *data;
6122 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124 if (!PyUnicode_Check(unicode)) {
6125 PyErr_BadArgument();
6126 return NULL;
6127 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006128 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 return NULL;
6130 kind = PyUnicode_KIND(unicode);
6131 data = PyUnicode_DATA(unicode);
6132 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006133 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6134 bytes, and 1 byte characters 4. */
6135 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006136
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006139
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 if (repr == NULL)
6142 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006144 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006146 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 for (pos = 0; pos < len; pos++) {
6148 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 /* Map 32-bit characters to '\Uxxxxxxxx' */
6150 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006151 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006152 *p++ = '\\';
6153 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006154 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6155 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6156 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6160 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6161 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006162 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 *p++ = '\\';
6166 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006167 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6168 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6169 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6170 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 /* Copy everything else as-is */
6173 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 *p++ = (char) ch;
6175 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 assert(p > q);
6178 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006179 return NULL;
6180 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181}
6182
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6185 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 PyObject *result;
6188 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6189 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006190 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6192 Py_DECREF(tmp);
6193 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194}
6195
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006196/* --- Unicode Internal Codec ------------------------------------------- */
6197
Alexander Belopolsky40018472011-02-26 01:02:56 +00006198PyObject *
6199_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006200 Py_ssize_t size,
6201 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006202{
6203 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006204 Py_ssize_t startinpos;
6205 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006206 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006207 const char *end;
6208 const char *reason;
6209 PyObject *errorHandler = NULL;
6210 PyObject *exc = NULL;
6211
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006212 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006213 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006214 1))
6215 return NULL;
6216
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006217 if (size == 0)
6218 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006219
Victor Stinner8f674cc2013-04-17 23:02:17 +02006220 _PyUnicodeWriter_Init(&writer);
6221 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6222 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006224 }
6225 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006226
Victor Stinner8f674cc2013-04-17 23:02:17 +02006227 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006228 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006229 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006230 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006231 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006232 endinpos = end-starts;
6233 reason = "truncated input";
6234 goto error;
6235 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006236 /* We copy the raw representation one byte at a time because the
6237 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006238 ((char *) &uch)[0] = s[0];
6239 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006240#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006241 ((char *) &uch)[2] = s[2];
6242 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006243#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006244 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006245#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006246 /* We have to sanity check the raw data, otherwise doom looms for
6247 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006248 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006249 endinpos = s - starts + Py_UNICODE_SIZE;
6250 reason = "illegal code point (> 0x10FFFF)";
6251 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006253#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 s += Py_UNICODE_SIZE;
6255#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006256 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006257 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006258 Py_UNICODE uch2;
6259 ((char *) &uch2)[0] = s[0];
6260 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006261 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006262 {
Victor Stinner551ac952011-11-29 22:58:13 +01006263 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006264 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006265 }
6266 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006267#endif
6268
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006269 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006270 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006271 continue;
6272
6273 error:
6274 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006275 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006276 errors, &errorHandler,
6277 "unicode_internal", reason,
6278 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006279 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006280 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 }
6282
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 Py_XDECREF(errorHandler);
6284 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006285 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006288 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006289 Py_XDECREF(errorHandler);
6290 Py_XDECREF(exc);
6291 return NULL;
6292}
6293
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294/* --- Latin-1 Codec ------------------------------------------------------ */
6295
Alexander Belopolsky40018472011-02-26 01:02:56 +00006296PyObject *
6297PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006298 Py_ssize_t size,
6299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006302 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303}
6304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306static void
6307make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006308 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006309 PyObject *unicode,
6310 Py_ssize_t startpos, Py_ssize_t endpos,
6311 const char *reason)
6312{
6313 if (*exceptionObject == NULL) {
6314 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006315 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006316 encoding, unicode, startpos, endpos, reason);
6317 }
6318 else {
6319 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6320 goto onError;
6321 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6322 goto onError;
6323 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6324 goto onError;
6325 return;
6326 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006327 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006328 }
6329}
6330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006331/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006332static void
6333raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006334 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006335 PyObject *unicode,
6336 Py_ssize_t startpos, Py_ssize_t endpos,
6337 const char *reason)
6338{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006339 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006340 encoding, unicode, startpos, endpos, reason);
6341 if (*exceptionObject != NULL)
6342 PyCodec_StrictErrors(*exceptionObject);
6343}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344
6345/* error handling callback helper:
6346 build arguments, call the callback and check the arguments,
6347 put the result into newpos and return the replacement string, which
6348 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006349static PyObject *
6350unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006351 PyObject **errorHandler,
6352 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t startpos, Py_ssize_t endpos,
6355 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006357 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 PyObject *restuple;
6360 PyObject *resunicode;
6361
6362 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366 }
6367
Benjamin Petersonbac79492012-01-14 13:34:47 -05006368 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 return NULL;
6370 len = PyUnicode_GET_LENGTH(unicode);
6371
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006372 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376
6377 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006382 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 Py_DECREF(restuple);
6384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006386 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 &resunicode, newpos)) {
6388 Py_DECREF(restuple);
6389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6392 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6393 Py_DECREF(restuple);
6394 return NULL;
6395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006397 *newpos = len + *newpos;
6398 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006399 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 Py_DECREF(restuple);
6401 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006402 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 Py_INCREF(resunicode);
6404 Py_DECREF(restuple);
6405 return resunicode;
6406}
6407
Alexander Belopolsky40018472011-02-26 01:02:56 +00006408static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006410 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006411 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 /* input state */
6414 Py_ssize_t pos=0, size;
6415 int kind;
6416 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 /* output object */
6418 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 /* pointer into the output */
6420 char *str;
6421 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006422 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006423 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6424 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 PyObject *errorHandler = NULL;
6426 PyObject *exc = NULL;
6427 /* the following variable is used for caching string comparisons
6428 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6429 int known_errorHandler = -1;
6430
Benjamin Petersonbac79492012-01-14 13:34:47 -05006431 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 return NULL;
6433 size = PyUnicode_GET_LENGTH(unicode);
6434 kind = PyUnicode_KIND(unicode);
6435 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 /* allocate enough for a simple encoding without
6437 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006438 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006439 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006440 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006442 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006443 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 ressize = size;
6445
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006446 while (pos < size) {
6447 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 /* can we encode this? */
6450 if (c<limit) {
6451 /* no overflow check, because we know that the space is enough */
6452 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006454 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 Py_ssize_t requiredsize;
6457 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006458 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006460 Py_ssize_t collstart = pos;
6461 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006463 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 ++collend;
6465 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6466 if (known_errorHandler==-1) {
6467 if ((errors==NULL) || (!strcmp(errors, "strict")))
6468 known_errorHandler = 1;
6469 else if (!strcmp(errors, "replace"))
6470 known_errorHandler = 2;
6471 else if (!strcmp(errors, "ignore"))
6472 known_errorHandler = 3;
6473 else if (!strcmp(errors, "xmlcharrefreplace"))
6474 known_errorHandler = 4;
6475 else
6476 known_errorHandler = 0;
6477 }
6478 switch (known_errorHandler) {
6479 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006480 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 goto onError;
6482 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006483 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 *str++ = '?'; /* fall through */
6485 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 break;
6488 case 4: /* xmlcharrefreplace */
6489 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006494 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006496 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006498 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006500 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006502 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006504 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006506 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006507 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006508 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006510 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006511 if (requiredsize > PY_SSIZE_T_MAX - incr)
6512 goto overflow;
6513 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006515 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6516 goto overflow;
6517 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006519 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 requiredsize = 2*ressize;
6521 if (_PyBytes_Resize(&res, requiredsize))
6522 goto onError;
6523 str = PyBytes_AS_STRING(res) + respos;
6524 ressize = requiredsize;
6525 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 /* generate replacement */
6527 for (i = collstart; i < collend; ++i) {
6528 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 break;
6532 default:
6533 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 encoding, reason, unicode, &exc,
6535 collstart, collend, &newpos);
6536 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006537 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006539 if (PyBytes_Check(repunicode)) {
6540 /* Directly copy bytes result to output. */
6541 repsize = PyBytes_Size(repunicode);
6542 if (repsize > 1) {
6543 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006544 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006545 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6546 Py_DECREF(repunicode);
6547 goto overflow;
6548 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006549 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6550 Py_DECREF(repunicode);
6551 goto onError;
6552 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006553 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006554 ressize += repsize-1;
6555 }
6556 memcpy(str, PyBytes_AsString(repunicode), repsize);
6557 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006559 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006560 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 /* need more space? (at least enough for what we
6563 have+the replacement+the rest of the string, so
6564 we won't have to check space for encodable characters) */
6565 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006567 requiredsize = respos;
6568 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6569 goto overflow;
6570 requiredsize += repsize;
6571 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6572 goto overflow;
6573 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006575 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 requiredsize = 2*ressize;
6577 if (_PyBytes_Resize(&res, requiredsize)) {
6578 Py_DECREF(repunicode);
6579 goto onError;
6580 }
6581 str = PyBytes_AS_STRING(res) + respos;
6582 ressize = requiredsize;
6583 }
6584 /* check if there is anything unencodable in the replacement
6585 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006586 for (i = 0; repsize-->0; ++i, ++str) {
6587 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006589 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006590 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 Py_DECREF(repunicode);
6592 goto onError;
6593 }
6594 *str = (char)c;
6595 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006596 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006597 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006599 }
6600 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006601 /* Resize if we allocated to much */
6602 size = str - PyBytes_AS_STRING(res);
6603 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006604 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006605 if (_PyBytes_Resize(&res, size) < 0)
6606 goto onError;
6607 }
6608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006611 return res;
6612
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006613 overflow:
6614 PyErr_SetString(PyExc_OverflowError,
6615 "encoded result is too long for a Python string");
6616
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006617 onError:
6618 Py_XDECREF(res);
6619 Py_XDECREF(errorHandler);
6620 Py_XDECREF(exc);
6621 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622}
6623
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625PyObject *
6626PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006627 Py_ssize_t size,
6628 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 PyObject *result;
6631 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6632 if (unicode == NULL)
6633 return NULL;
6634 result = unicode_encode_ucs1(unicode, errors, 256);
6635 Py_DECREF(unicode);
6636 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
Alexander Belopolsky40018472011-02-26 01:02:56 +00006639PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
6642 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 PyErr_BadArgument();
6644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006646 if (PyUnicode_READY(unicode) == -1)
6647 return NULL;
6648 /* Fast path: if it is a one-byte string, construct
6649 bytes object directly. */
6650 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6651 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6652 PyUnicode_GET_LENGTH(unicode));
6653 /* Non-Latin-1 characters present. Defer to above function to
6654 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656}
6657
6658PyObject*
6659PyUnicode_AsLatin1String(PyObject *unicode)
6660{
6661 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
6664/* --- 7-bit ASCII Codec -------------------------------------------------- */
6665
Alexander Belopolsky40018472011-02-26 01:02:56 +00006666PyObject *
6667PyUnicode_DecodeASCII(const char *s,
6668 Py_ssize_t size,
6669 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006672 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006673 int kind;
6674 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 Py_ssize_t startinpos;
6676 Py_ssize_t endinpos;
6677 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006678 const char *e;
6679 PyObject *errorHandler = NULL;
6680 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006681
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006683 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006686 if (size == 1 && (unsigned char)s[0] < 128)
6687 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006688
Victor Stinner8f674cc2013-04-17 23:02:17 +02006689 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006690 writer.min_length = size;
6691 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006692 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006695 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006696 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006697 writer.pos = outpos;
6698 if (writer.pos == size)
6699 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006700
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006701 s += writer.pos;
6702 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006704 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006706 PyUnicode_WRITE(kind, data, writer.pos, c);
6707 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 ++s;
6709 }
6710 else {
6711 startinpos = s-starts;
6712 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006713 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 errors, &errorHandler,
6715 "ascii", "ordinal not in range(128)",
6716 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006717 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006719 kind = writer.kind;
6720 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 Py_XDECREF(errorHandler);
6724 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006726
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006728 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 Py_XDECREF(errorHandler);
6730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return NULL;
6732}
6733
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006735PyObject *
6736PyUnicode_EncodeASCII(const Py_UNICODE *p,
6737 Py_ssize_t size,
6738 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 PyObject *result;
6741 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6742 if (unicode == NULL)
6743 return NULL;
6744 result = unicode_encode_ucs1(unicode, errors, 128);
6745 Py_DECREF(unicode);
6746 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747}
6748
Alexander Belopolsky40018472011-02-26 01:02:56 +00006749PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006750_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751{
6752 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 PyErr_BadArgument();
6754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006756 if (PyUnicode_READY(unicode) == -1)
6757 return NULL;
6758 /* Fast path: if it is an ASCII-only string, construct bytes object
6759 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006760 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006761 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6762 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006764}
6765
6766PyObject *
6767PyUnicode_AsASCIIString(PyObject *unicode)
6768{
6769 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770}
6771
Victor Stinner99b95382011-07-04 14:23:54 +02006772#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006773
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006774/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006775
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006776#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777#define NEED_RETRY
6778#endif
6779
Victor Stinner3a50e702011-10-18 21:21:00 +02006780#ifndef WC_ERR_INVALID_CHARS
6781# define WC_ERR_INVALID_CHARS 0x0080
6782#endif
6783
6784static char*
6785code_page_name(UINT code_page, PyObject **obj)
6786{
6787 *obj = NULL;
6788 if (code_page == CP_ACP)
6789 return "mbcs";
6790 if (code_page == CP_UTF7)
6791 return "CP_UTF7";
6792 if (code_page == CP_UTF8)
6793 return "CP_UTF8";
6794
6795 *obj = PyBytes_FromFormat("cp%u", code_page);
6796 if (*obj == NULL)
6797 return NULL;
6798 return PyBytes_AS_STRING(*obj);
6799}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800
Victor Stinner3a50e702011-10-18 21:21:00 +02006801static DWORD
6802decode_code_page_flags(UINT code_page)
6803{
6804 if (code_page == CP_UTF7) {
6805 /* The CP_UTF7 decoder only supports flags=0 */
6806 return 0;
6807 }
6808 else
6809 return MB_ERR_INVALID_CHARS;
6810}
6811
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006812/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006813 * Decode a byte string from a Windows code page into unicode object in strict
6814 * mode.
6815 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006816 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6817 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006819static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006820decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006821 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006822 const char *in,
6823 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824{
Victor Stinner3a50e702011-10-18 21:21:00 +02006825 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006826 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006827 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828
6829 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006830 assert(insize > 0);
6831 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6832 if (outsize <= 0)
6833 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006834
6835 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006837 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006838 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 if (*v == NULL)
6840 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842 }
6843 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006845 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006846 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006848 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849 }
6850
6851 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006852 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6853 if (outsize <= 0)
6854 goto error;
6855 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006856
Victor Stinner3a50e702011-10-18 21:21:00 +02006857error:
6858 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6859 return -2;
6860 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006861 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862}
6863
Victor Stinner3a50e702011-10-18 21:21:00 +02006864/*
6865 * Decode a byte string from a code page into unicode object with an error
6866 * handler.
6867 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006868 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 * UnicodeDecodeError exception and returns -1 on error.
6870 */
6871static int
6872decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006873 PyObject **v,
6874 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006875 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006876{
6877 const char *startin = in;
6878 const char *endin = in + size;
6879 const DWORD flags = decode_code_page_flags(code_page);
6880 /* Ideally, we should get reason from FormatMessage. This is the Windows
6881 2000 English version of the message. */
6882 const char *reason = "No mapping for the Unicode character exists "
6883 "in the target code page.";
6884 /* each step cannot decode more than 1 character, but a character can be
6885 represented as a surrogate pair */
6886 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006887 int insize;
6888 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006889 PyObject *errorHandler = NULL;
6890 PyObject *exc = NULL;
6891 PyObject *encoding_obj = NULL;
6892 char *encoding;
6893 DWORD err;
6894 int ret = -1;
6895
6896 assert(size > 0);
6897
6898 encoding = code_page_name(code_page, &encoding_obj);
6899 if (encoding == NULL)
6900 return -1;
6901
Victor Stinner7d00cc12014-03-17 23:08:06 +01006902 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006903 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6904 UnicodeDecodeError. */
6905 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6906 if (exc != NULL) {
6907 PyCodec_StrictErrors(exc);
6908 Py_CLEAR(exc);
6909 }
6910 goto error;
6911 }
6912
6913 if (*v == NULL) {
6914 /* Create unicode object */
6915 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6916 PyErr_NoMemory();
6917 goto error;
6918 }
Victor Stinnerab595942011-12-17 04:59:06 +01006919 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 if (*v == NULL)
6922 goto error;
6923 startout = PyUnicode_AS_UNICODE(*v);
6924 }
6925 else {
6926 /* Extend unicode object */
6927 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6928 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6929 PyErr_NoMemory();
6930 goto error;
6931 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006932 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 goto error;
6934 startout = PyUnicode_AS_UNICODE(*v) + n;
6935 }
6936
6937 /* Decode the byte string character per character */
6938 out = startout;
6939 while (in < endin)
6940 {
6941 /* Decode a character */
6942 insize = 1;
6943 do
6944 {
6945 outsize = MultiByteToWideChar(code_page, flags,
6946 in, insize,
6947 buffer, Py_ARRAY_LENGTH(buffer));
6948 if (outsize > 0)
6949 break;
6950 err = GetLastError();
6951 if (err != ERROR_NO_UNICODE_TRANSLATION
6952 && err != ERROR_INSUFFICIENT_BUFFER)
6953 {
6954 PyErr_SetFromWindowsErr(0);
6955 goto error;
6956 }
6957 insize++;
6958 }
6959 /* 4=maximum length of a UTF-8 sequence */
6960 while (insize <= 4 && (in + insize) <= endin);
6961
6962 if (outsize <= 0) {
6963 Py_ssize_t startinpos, endinpos, outpos;
6964
Victor Stinner7d00cc12014-03-17 23:08:06 +01006965 /* last character in partial decode? */
6966 if (in + insize >= endin && !final)
6967 break;
6968
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 startinpos = in - startin;
6970 endinpos = startinpos + 1;
6971 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 errors, &errorHandler,
6974 encoding, reason,
6975 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006976 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006977 {
6978 goto error;
6979 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006980 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 }
6982 else {
6983 in += insize;
6984 memcpy(out, buffer, outsize * sizeof(wchar_t));
6985 out += outsize;
6986 }
6987 }
6988
6989 /* write a NUL character at the end */
6990 *out = 0;
6991
6992 /* Extend unicode object */
6993 outsize = out - startout;
6994 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006995 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006997 /* (in - startin) <= size and size is an int */
6998 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006999
7000error:
7001 Py_XDECREF(encoding_obj);
7002 Py_XDECREF(errorHandler);
7003 Py_XDECREF(exc);
7004 return ret;
7005}
7006
Victor Stinner3a50e702011-10-18 21:21:00 +02007007static PyObject *
7008decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007009 const char *s, Py_ssize_t size,
7010 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011{
Victor Stinner76a31a62011-11-04 00:05:13 +01007012 PyObject *v = NULL;
7013 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 if (code_page < 0) {
7016 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7017 return NULL;
7018 }
7019
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022
Victor Stinner76a31a62011-11-04 00:05:13 +01007023 do
7024 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007025#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007026 if (size > INT_MAX) {
7027 chunk_size = INT_MAX;
7028 final = 0;
7029 done = 0;
7030 }
7031 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 {
7034 chunk_size = (int)size;
7035 final = (consumed == NULL);
7036 done = 1;
7037 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038
Victor Stinner76a31a62011-11-04 00:05:13 +01007039 if (chunk_size == 0 && done) {
7040 if (v != NULL)
7041 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007042 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 converted = decode_code_page_strict(code_page, &v,
7046 s, chunk_size);
7047 if (converted == -2)
7048 converted = decode_code_page_errors(code_page, &v,
7049 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007050 errors, final);
7051 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007052
7053 if (converted < 0) {
7054 Py_XDECREF(v);
7055 return NULL;
7056 }
7057
7058 if (consumed)
7059 *consumed += converted;
7060
7061 s += converted;
7062 size -= converted;
7063 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007064
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007065 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066}
7067
Alexander Belopolsky40018472011-02-26 01:02:56 +00007068PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007069PyUnicode_DecodeCodePageStateful(int code_page,
7070 const char *s,
7071 Py_ssize_t size,
7072 const char *errors,
7073 Py_ssize_t *consumed)
7074{
7075 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7076}
7077
7078PyObject *
7079PyUnicode_DecodeMBCSStateful(const char *s,
7080 Py_ssize_t size,
7081 const char *errors,
7082 Py_ssize_t *consumed)
7083{
7084 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7085}
7086
7087PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007088PyUnicode_DecodeMBCS(const char *s,
7089 Py_ssize_t size,
7090 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007091{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7093}
7094
Victor Stinner3a50e702011-10-18 21:21:00 +02007095static DWORD
7096encode_code_page_flags(UINT code_page, const char *errors)
7097{
7098 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007099 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 }
7101 else if (code_page == CP_UTF7) {
7102 /* CP_UTF7 only supports flags=0 */
7103 return 0;
7104 }
7105 else {
7106 if (errors != NULL && strcmp(errors, "replace") == 0)
7107 return 0;
7108 else
7109 return WC_NO_BEST_FIT_CHARS;
7110 }
7111}
7112
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 * Encode a Unicode string to a Windows code page into a byte string in strict
7115 * mode.
7116 *
7117 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007118 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007120static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007121encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007122 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007123 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124{
Victor Stinner554f3f02010-06-16 23:33:54 +00007125 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 BOOL *pusedDefaultChar = &usedDefaultChar;
7127 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007128 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007129 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007130 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 const DWORD flags = encode_code_page_flags(code_page, NULL);
7132 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007133 /* Create a substring so that we can get the UTF-16 representation
7134 of just the slice under consideration. */
7135 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136
Martin v. Löwis3d325192011-11-04 18:23:06 +01007137 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007138
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007140 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007142 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007143
Victor Stinner2fc507f2011-11-04 20:06:39 +01007144 substring = PyUnicode_Substring(unicode, offset, offset+len);
7145 if (substring == NULL)
7146 return -1;
7147 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7148 if (p == NULL) {
7149 Py_DECREF(substring);
7150 return -1;
7151 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007152 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007153
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007154 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007156 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 NULL, 0,
7158 NULL, pusedDefaultChar);
7159 if (outsize <= 0)
7160 goto error;
7161 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007162 if (pusedDefaultChar && *pusedDefaultChar) {
7163 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007165 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007166
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007170 if (*outbytes == NULL) {
7171 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007173 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007175 }
7176 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 const Py_ssize_t n = PyBytes_Size(*outbytes);
7179 if (outsize > PY_SSIZE_T_MAX - n) {
7180 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007181 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007184 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7185 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007187 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189 }
7190
7191 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007193 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 out, outsize,
7195 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 if (outsize <= 0)
7198 goto error;
7199 if (pusedDefaultChar && *pusedDefaultChar)
7200 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007202
Victor Stinner3a50e702011-10-18 21:21:00 +02007203error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007204 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7206 return -2;
7207 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007208 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007209}
7210
Victor Stinner3a50e702011-10-18 21:21:00 +02007211/*
7212 * Encode a Unicode string to a Windows code page into a byte string using a
7213 * error handler.
7214 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007215 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 * -1 on other error.
7217 */
7218static int
7219encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007220 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007221 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007222{
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007224 Py_ssize_t pos = unicode_offset;
7225 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 /* Ideally, we should get reason from FormatMessage. This is the Windows
7227 2000 English version of the message. */
7228 const char *reason = "invalid character";
7229 /* 4=maximum length of a UTF-8 sequence */
7230 char buffer[4];
7231 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7232 Py_ssize_t outsize;
7233 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 PyObject *errorHandler = NULL;
7235 PyObject *exc = NULL;
7236 PyObject *encoding_obj = NULL;
7237 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007238 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 PyObject *rep;
7240 int ret = -1;
7241
7242 assert(insize > 0);
7243
7244 encoding = code_page_name(code_page, &encoding_obj);
7245 if (encoding == NULL)
7246 return -1;
7247
7248 if (errors == NULL || strcmp(errors, "strict") == 0) {
7249 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7250 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007251 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 if (exc != NULL) {
7253 PyCodec_StrictErrors(exc);
7254 Py_DECREF(exc);
7255 }
7256 Py_XDECREF(encoding_obj);
7257 return -1;
7258 }
7259
7260 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7261 pusedDefaultChar = &usedDefaultChar;
7262 else
7263 pusedDefaultChar = NULL;
7264
7265 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7266 PyErr_NoMemory();
7267 goto error;
7268 }
7269 outsize = insize * Py_ARRAY_LENGTH(buffer);
7270
7271 if (*outbytes == NULL) {
7272 /* Create string object */
7273 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7274 if (*outbytes == NULL)
7275 goto error;
7276 out = PyBytes_AS_STRING(*outbytes);
7277 }
7278 else {
7279 /* Extend string object */
7280 Py_ssize_t n = PyBytes_Size(*outbytes);
7281 if (n > PY_SSIZE_T_MAX - outsize) {
7282 PyErr_NoMemory();
7283 goto error;
7284 }
7285 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7286 goto error;
7287 out = PyBytes_AS_STRING(*outbytes) + n;
7288 }
7289
7290 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007291 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007293 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7294 wchar_t chars[2];
7295 int charsize;
7296 if (ch < 0x10000) {
7297 chars[0] = (wchar_t)ch;
7298 charsize = 1;
7299 }
7300 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007301 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7302 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007303 charsize = 2;
7304 }
7305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007307 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 buffer, Py_ARRAY_LENGTH(buffer),
7309 NULL, pusedDefaultChar);
7310 if (outsize > 0) {
7311 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7312 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007313 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 memcpy(out, buffer, outsize);
7315 out += outsize;
7316 continue;
7317 }
7318 }
7319 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7320 PyErr_SetFromWindowsErr(0);
7321 goto error;
7322 }
7323
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 rep = unicode_encode_call_errorhandler(
7325 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007326 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 if (rep == NULL)
7329 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007330 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007331
7332 if (PyBytes_Check(rep)) {
7333 outsize = PyBytes_GET_SIZE(rep);
7334 if (outsize != 1) {
7335 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7336 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7337 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7338 Py_DECREF(rep);
7339 goto error;
7340 }
7341 out = PyBytes_AS_STRING(*outbytes) + offset;
7342 }
7343 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7344 out += outsize;
7345 }
7346 else {
7347 Py_ssize_t i;
7348 enum PyUnicode_Kind kind;
7349 void *data;
7350
Benjamin Petersonbac79492012-01-14 13:34:47 -05007351 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 Py_DECREF(rep);
7353 goto error;
7354 }
7355
7356 outsize = PyUnicode_GET_LENGTH(rep);
7357 if (outsize != 1) {
7358 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7359 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7360 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7361 Py_DECREF(rep);
7362 goto error;
7363 }
7364 out = PyBytes_AS_STRING(*outbytes) + offset;
7365 }
7366 kind = PyUnicode_KIND(rep);
7367 data = PyUnicode_DATA(rep);
7368 for (i=0; i < outsize; i++) {
7369 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7370 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007371 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007372 encoding, unicode,
7373 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 "unable to encode error handler result to ASCII");
7375 Py_DECREF(rep);
7376 goto error;
7377 }
7378 *out = (unsigned char)ch;
7379 out++;
7380 }
7381 }
7382 Py_DECREF(rep);
7383 }
7384 /* write a NUL byte */
7385 *out = 0;
7386 outsize = out - PyBytes_AS_STRING(*outbytes);
7387 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7388 if (_PyBytes_Resize(outbytes, outsize) < 0)
7389 goto error;
7390 ret = 0;
7391
7392error:
7393 Py_XDECREF(encoding_obj);
7394 Py_XDECREF(errorHandler);
7395 Py_XDECREF(exc);
7396 return ret;
7397}
7398
Victor Stinner3a50e702011-10-18 21:21:00 +02007399static PyObject *
7400encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007401 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 const char *errors)
7403{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007404 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007406 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007407 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007408
Victor Stinner29dacf22015-01-26 16:41:32 +01007409 if (!PyUnicode_Check(unicode)) {
7410 PyErr_BadArgument();
7411 return NULL;
7412 }
7413
Benjamin Petersonbac79492012-01-14 13:34:47 -05007414 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007415 return NULL;
7416 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007417
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 if (code_page < 0) {
7419 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7420 return NULL;
7421 }
7422
Martin v. Löwis3d325192011-11-04 18:23:06 +01007423 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007424 return PyBytes_FromStringAndSize(NULL, 0);
7425
Victor Stinner7581cef2011-11-03 22:32:33 +01007426 offset = 0;
7427 do
7428 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007431 chunks. */
7432 if (len > INT_MAX/2) {
7433 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007434 done = 0;
7435 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007436 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007437#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007438 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007440 done = 1;
7441 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007444 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007445 errors);
7446 if (ret == -2)
7447 ret = encode_code_page_errors(code_page, &outbytes,
7448 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007449 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007450 if (ret < 0) {
7451 Py_XDECREF(outbytes);
7452 return NULL;
7453 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007454
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007456 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 return outbytes;
7460}
7461
7462PyObject *
7463PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7464 Py_ssize_t size,
7465 const char *errors)
7466{
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 PyObject *unicode, *res;
7468 unicode = PyUnicode_FromUnicode(p, size);
7469 if (unicode == NULL)
7470 return NULL;
7471 res = encode_code_page(CP_ACP, unicode, errors);
7472 Py_DECREF(unicode);
7473 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007474}
7475
7476PyObject *
7477PyUnicode_EncodeCodePage(int code_page,
7478 PyObject *unicode,
7479 const char *errors)
7480{
Victor Stinner7581cef2011-11-03 22:32:33 +01007481 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007483
Alexander Belopolsky40018472011-02-26 01:02:56 +00007484PyObject *
7485PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007486{
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007488}
7489
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490#undef NEED_RETRY
7491
Victor Stinner99b95382011-07-04 14:23:54 +02007492#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007493
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494/* --- Character Mapping Codec -------------------------------------------- */
7495
Victor Stinnerfb161b12013-04-18 01:44:27 +02007496static int
7497charmap_decode_string(const char *s,
7498 Py_ssize_t size,
7499 PyObject *mapping,
7500 const char *errors,
7501 _PyUnicodeWriter *writer)
7502{
7503 const char *starts = s;
7504 const char *e;
7505 Py_ssize_t startinpos, endinpos;
7506 PyObject *errorHandler = NULL, *exc = NULL;
7507 Py_ssize_t maplen;
7508 enum PyUnicode_Kind mapkind;
7509 void *mapdata;
7510 Py_UCS4 x;
7511 unsigned char ch;
7512
7513 if (PyUnicode_READY(mapping) == -1)
7514 return -1;
7515
7516 maplen = PyUnicode_GET_LENGTH(mapping);
7517 mapdata = PyUnicode_DATA(mapping);
7518 mapkind = PyUnicode_KIND(mapping);
7519
7520 e = s + size;
7521
7522 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7523 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7524 * is disabled in encoding aliases, latin1 is preferred because
7525 * its implementation is faster. */
7526 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7527 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7528 Py_UCS4 maxchar = writer->maxchar;
7529
7530 assert (writer->kind == PyUnicode_1BYTE_KIND);
7531 while (s < e) {
7532 ch = *s;
7533 x = mapdata_ucs1[ch];
7534 if (x > maxchar) {
7535 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7536 goto onError;
7537 maxchar = writer->maxchar;
7538 outdata = (Py_UCS1 *)writer->data;
7539 }
7540 outdata[writer->pos] = x;
7541 writer->pos++;
7542 ++s;
7543 }
7544 return 0;
7545 }
7546
7547 while (s < e) {
7548 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7549 enum PyUnicode_Kind outkind = writer->kind;
7550 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7551 if (outkind == PyUnicode_1BYTE_KIND) {
7552 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7553 Py_UCS4 maxchar = writer->maxchar;
7554 while (s < e) {
7555 ch = *s;
7556 x = mapdata_ucs2[ch];
7557 if (x > maxchar)
7558 goto Error;
7559 outdata[writer->pos] = x;
7560 writer->pos++;
7561 ++s;
7562 }
7563 break;
7564 }
7565 else if (outkind == PyUnicode_2BYTE_KIND) {
7566 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7567 while (s < e) {
7568 ch = *s;
7569 x = mapdata_ucs2[ch];
7570 if (x == 0xFFFE)
7571 goto Error;
7572 outdata[writer->pos] = x;
7573 writer->pos++;
7574 ++s;
7575 }
7576 break;
7577 }
7578 }
7579 ch = *s;
7580
7581 if (ch < maplen)
7582 x = PyUnicode_READ(mapkind, mapdata, ch);
7583 else
7584 x = 0xfffe; /* invalid value */
7585Error:
7586 if (x == 0xfffe)
7587 {
7588 /* undefined mapping */
7589 startinpos = s-starts;
7590 endinpos = startinpos+1;
7591 if (unicode_decode_call_errorhandler_writer(
7592 errors, &errorHandler,
7593 "charmap", "character maps to <undefined>",
7594 &starts, &e, &startinpos, &endinpos, &exc, &s,
7595 writer)) {
7596 goto onError;
7597 }
7598 continue;
7599 }
7600
7601 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7602 goto onError;
7603 ++s;
7604 }
7605 Py_XDECREF(errorHandler);
7606 Py_XDECREF(exc);
7607 return 0;
7608
7609onError:
7610 Py_XDECREF(errorHandler);
7611 Py_XDECREF(exc);
7612 return -1;
7613}
7614
7615static int
7616charmap_decode_mapping(const char *s,
7617 Py_ssize_t size,
7618 PyObject *mapping,
7619 const char *errors,
7620 _PyUnicodeWriter *writer)
7621{
7622 const char *starts = s;
7623 const char *e;
7624 Py_ssize_t startinpos, endinpos;
7625 PyObject *errorHandler = NULL, *exc = NULL;
7626 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007627 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007628
7629 e = s + size;
7630
7631 while (s < e) {
7632 ch = *s;
7633
7634 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7635 key = PyLong_FromLong((long)ch);
7636 if (key == NULL)
7637 goto onError;
7638
7639 item = PyObject_GetItem(mapping, key);
7640 Py_DECREF(key);
7641 if (item == NULL) {
7642 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7643 /* No mapping found means: mapping is undefined. */
7644 PyErr_Clear();
7645 goto Undefined;
7646 } else
7647 goto onError;
7648 }
7649
7650 /* Apply mapping */
7651 if (item == Py_None)
7652 goto Undefined;
7653 if (PyLong_Check(item)) {
7654 long value = PyLong_AS_LONG(item);
7655 if (value == 0xFFFE)
7656 goto Undefined;
7657 if (value < 0 || value > MAX_UNICODE) {
7658 PyErr_Format(PyExc_TypeError,
7659 "character mapping must be in range(0x%lx)",
7660 (unsigned long)MAX_UNICODE + 1);
7661 goto onError;
7662 }
7663
7664 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7665 goto onError;
7666 }
7667 else if (PyUnicode_Check(item)) {
7668 if (PyUnicode_READY(item) == -1)
7669 goto onError;
7670 if (PyUnicode_GET_LENGTH(item) == 1) {
7671 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7672 if (value == 0xFFFE)
7673 goto Undefined;
7674 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7675 goto onError;
7676 }
7677 else {
7678 writer->overallocate = 1;
7679 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7680 goto onError;
7681 }
7682 }
7683 else {
7684 /* wrong return value */
7685 PyErr_SetString(PyExc_TypeError,
7686 "character mapping must return integer, None or str");
7687 goto onError;
7688 }
7689 Py_CLEAR(item);
7690 ++s;
7691 continue;
7692
7693Undefined:
7694 /* undefined mapping */
7695 Py_CLEAR(item);
7696 startinpos = s-starts;
7697 endinpos = startinpos+1;
7698 if (unicode_decode_call_errorhandler_writer(
7699 errors, &errorHandler,
7700 "charmap", "character maps to <undefined>",
7701 &starts, &e, &startinpos, &endinpos, &exc, &s,
7702 writer)) {
7703 goto onError;
7704 }
7705 }
7706 Py_XDECREF(errorHandler);
7707 Py_XDECREF(exc);
7708 return 0;
7709
7710onError:
7711 Py_XDECREF(item);
7712 Py_XDECREF(errorHandler);
7713 Py_XDECREF(exc);
7714 return -1;
7715}
7716
Alexander Belopolsky40018472011-02-26 01:02:56 +00007717PyObject *
7718PyUnicode_DecodeCharmap(const char *s,
7719 Py_ssize_t size,
7720 PyObject *mapping,
7721 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007723 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007724
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 /* Default to Latin-1 */
7726 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007730 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007731 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007732 writer.min_length = size;
7733 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007735
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007736 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007737 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7738 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007739 }
7740 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007741 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007745
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007747 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 return NULL;
7749}
7750
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751/* Charmap encoding: the lookup table */
7752
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 PyObject_HEAD
7755 unsigned char level1[32];
7756 int count2, count3;
7757 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758};
7759
7760static PyObject*
7761encoding_map_size(PyObject *obj, PyObject* args)
7762{
7763 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766}
7767
7768static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 PyDoc_STR("Return the size (in bytes) of this object") },
7771 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007772};
7773
7774static void
7775encoding_map_dealloc(PyObject* o)
7776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007778}
7779
7780static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007781 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 "EncodingMap", /*tp_name*/
7783 sizeof(struct encoding_map), /*tp_basicsize*/
7784 0, /*tp_itemsize*/
7785 /* methods */
7786 encoding_map_dealloc, /*tp_dealloc*/
7787 0, /*tp_print*/
7788 0, /*tp_getattr*/
7789 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007790 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 0, /*tp_repr*/
7792 0, /*tp_as_number*/
7793 0, /*tp_as_sequence*/
7794 0, /*tp_as_mapping*/
7795 0, /*tp_hash*/
7796 0, /*tp_call*/
7797 0, /*tp_str*/
7798 0, /*tp_getattro*/
7799 0, /*tp_setattro*/
7800 0, /*tp_as_buffer*/
7801 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7802 0, /*tp_doc*/
7803 0, /*tp_traverse*/
7804 0, /*tp_clear*/
7805 0, /*tp_richcompare*/
7806 0, /*tp_weaklistoffset*/
7807 0, /*tp_iter*/
7808 0, /*tp_iternext*/
7809 encoding_map_methods, /*tp_methods*/
7810 0, /*tp_members*/
7811 0, /*tp_getset*/
7812 0, /*tp_base*/
7813 0, /*tp_dict*/
7814 0, /*tp_descr_get*/
7815 0, /*tp_descr_set*/
7816 0, /*tp_dictoffset*/
7817 0, /*tp_init*/
7818 0, /*tp_alloc*/
7819 0, /*tp_new*/
7820 0, /*tp_free*/
7821 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822};
7823
7824PyObject*
7825PyUnicode_BuildEncodingMap(PyObject* string)
7826{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 PyObject *result;
7828 struct encoding_map *mresult;
7829 int i;
7830 int need_dict = 0;
7831 unsigned char level1[32];
7832 unsigned char level2[512];
7833 unsigned char *mlevel1, *mlevel2, *mlevel3;
7834 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 int kind;
7836 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007837 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007840 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 PyErr_BadArgument();
7842 return NULL;
7843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 kind = PyUnicode_KIND(string);
7845 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007846 length = PyUnicode_GET_LENGTH(string);
7847 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 memset(level1, 0xFF, sizeof level1);
7849 memset(level2, 0xFF, sizeof level2);
7850
7851 /* If there isn't a one-to-one mapping of NULL to \0,
7852 or if there are non-BMP characters, we need to use
7853 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007854 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007855 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007856 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 ch = PyUnicode_READ(kind, data, i);
7859 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007860 need_dict = 1;
7861 break;
7862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864 /* unmapped character */
7865 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 l1 = ch >> 11;
7867 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007868 if (level1[l1] == 0xFF)
7869 level1[l1] = count2++;
7870 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007871 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007872 }
7873
7874 if (count2 >= 0xFF || count3 >= 0xFF)
7875 need_dict = 1;
7876
7877 if (need_dict) {
7878 PyObject *result = PyDict_New();
7879 PyObject *key, *value;
7880 if (!result)
7881 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007882 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007884 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 if (!key || !value)
7886 goto failed1;
7887 if (PyDict_SetItem(result, key, value) == -1)
7888 goto failed1;
7889 Py_DECREF(key);
7890 Py_DECREF(value);
7891 }
7892 return result;
7893 failed1:
7894 Py_XDECREF(key);
7895 Py_XDECREF(value);
7896 Py_DECREF(result);
7897 return NULL;
7898 }
7899
7900 /* Create a three-level trie */
7901 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7902 16*count2 + 128*count3 - 1);
7903 if (!result)
7904 return PyErr_NoMemory();
7905 PyObject_Init(result, &EncodingMapType);
7906 mresult = (struct encoding_map*)result;
7907 mresult->count2 = count2;
7908 mresult->count3 = count3;
7909 mlevel1 = mresult->level1;
7910 mlevel2 = mresult->level23;
7911 mlevel3 = mresult->level23 + 16*count2;
7912 memcpy(mlevel1, level1, 32);
7913 memset(mlevel2, 0xFF, 16*count2);
7914 memset(mlevel3, 0, 128*count3);
7915 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007916 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007918 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7919 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920 /* unmapped character */
7921 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007922 o1 = ch>>11;
7923 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 i2 = 16*mlevel1[o1] + o2;
7925 if (mlevel2[i2] == 0xFF)
7926 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007927 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928 i3 = 128*mlevel2[i2] + o3;
7929 mlevel3[i3] = i;
7930 }
7931 return result;
7932}
7933
7934static int
Victor Stinner22168992011-11-20 17:09:18 +01007935encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936{
7937 struct encoding_map *map = (struct encoding_map*)mapping;
7938 int l1 = c>>11;
7939 int l2 = (c>>7) & 0xF;
7940 int l3 = c & 0x7F;
7941 int i;
7942
Victor Stinner22168992011-11-20 17:09:18 +01007943 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 if (c == 0)
7946 return 0;
7947 /* level 1*/
7948 i = map->level1[l1];
7949 if (i == 0xFF) {
7950 return -1;
7951 }
7952 /* level 2*/
7953 i = map->level23[16*i+l2];
7954 if (i == 0xFF) {
7955 return -1;
7956 }
7957 /* level 3 */
7958 i = map->level23[16*map->count2 + 128*i + l3];
7959 if (i == 0) {
7960 return -1;
7961 }
7962 return i;
7963}
7964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007965/* Lookup the character ch in the mapping. If the character
7966 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007967 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007968static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007969charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970{
Christian Heimes217cfd12007-12-02 14:31:20 +00007971 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 PyObject *x;
7973
7974 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 x = PyObject_GetItem(mapping, w);
7977 Py_DECREF(w);
7978 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7980 /* No mapping found means: mapping is undefined. */
7981 PyErr_Clear();
7982 x = Py_None;
7983 Py_INCREF(x);
7984 return x;
7985 } else
7986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007988 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007990 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 long value = PyLong_AS_LONG(x);
7992 if (value < 0 || value > 255) {
7993 PyErr_SetString(PyExc_TypeError,
7994 "character mapping must be in range(256)");
7995 Py_DECREF(x);
7996 return NULL;
7997 }
7998 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008000 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 /* wrong return value */
8004 PyErr_Format(PyExc_TypeError,
8005 "character mapping must return integer, bytes or None, not %.400s",
8006 x->ob_type->tp_name);
8007 Py_DECREF(x);
8008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 }
8010}
8011
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008012static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008013charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008014{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8016 /* exponentially overallocate to minimize reallocations */
8017 if (requiredsize < 2*outsize)
8018 requiredsize = 2*outsize;
8019 if (_PyBytes_Resize(outobj, requiredsize))
8020 return -1;
8021 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008022}
8023
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008027/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008028 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008029 space is available. Return a new reference to the object that
8030 was put in the output buffer, or Py_None, if the mapping was undefined
8031 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008032 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008034charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008035 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037 PyObject *rep;
8038 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008039 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040
Christian Heimes90aa7642007-12-19 02:45:37 +00008041 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044 if (res == -1)
8045 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 if (outsize<requiredsize)
8047 if (charmapencode_resize(outobj, outpos, requiredsize))
8048 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008049 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 outstart[(*outpos)++] = (char)res;
8051 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052 }
8053
8054 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008055 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 Py_DECREF(rep);
8059 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 if (PyLong_Check(rep)) {
8062 Py_ssize_t requiredsize = *outpos+1;
8063 if (outsize<requiredsize)
8064 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8065 Py_DECREF(rep);
8066 return enc_EXCEPTION;
8067 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008068 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008070 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 else {
8072 const char *repchars = PyBytes_AS_STRING(rep);
8073 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8074 Py_ssize_t requiredsize = *outpos+repsize;
8075 if (outsize<requiredsize)
8076 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8077 Py_DECREF(rep);
8078 return enc_EXCEPTION;
8079 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008080 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 memcpy(outstart + *outpos, repchars, repsize);
8082 *outpos += repsize;
8083 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008085 Py_DECREF(rep);
8086 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087}
8088
8089/* handle an error in PyUnicode_EncodeCharmap
8090 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008091static int
8092charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008095 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008096 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097{
8098 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008099 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008101 enum PyUnicode_Kind kind;
8102 void *data;
8103 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 Py_ssize_t collstartpos = *inpos;
8106 Py_ssize_t collendpos = *inpos+1;
8107 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 char *encoding = "charmap";
8109 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008112 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113
Benjamin Petersonbac79492012-01-14 13:34:47 -05008114 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008115 return -1;
8116 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 /* find all unencodable characters */
8118 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008120 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008121 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008122 val = encoding_map_lookup(ch, mapping);
8123 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 break;
8125 ++collendpos;
8126 continue;
8127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008129 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8130 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (rep==NULL)
8132 return -1;
8133 else if (rep!=Py_None) {
8134 Py_DECREF(rep);
8135 break;
8136 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008139 }
8140 /* cache callback name lookup
8141 * (if not done yet, i.e. it's the first error) */
8142 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 if ((errors==NULL) || (!strcmp(errors, "strict")))
8144 *known_errorHandler = 1;
8145 else if (!strcmp(errors, "replace"))
8146 *known_errorHandler = 2;
8147 else if (!strcmp(errors, "ignore"))
8148 *known_errorHandler = 3;
8149 else if (!strcmp(errors, "xmlcharrefreplace"))
8150 *known_errorHandler = 4;
8151 else
8152 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 }
8154 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008155 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008156 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008157 return -1;
8158 case 2: /* replace */
8159 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 x = charmapencode_output('?', mapping, res, respos);
8161 if (x==enc_EXCEPTION) {
8162 return -1;
8163 }
8164 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008165 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return -1;
8167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 }
8169 /* fall through */
8170 case 3: /* ignore */
8171 *inpos = collendpos;
8172 break;
8173 case 4: /* xmlcharrefreplace */
8174 /* generate replacement (temporarily (mis)uses p) */
8175 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 char buffer[2+29+1+1];
8177 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 for (cp = buffer; *cp; ++cp) {
8180 x = charmapencode_output(*cp, mapping, res, respos);
8181 if (x==enc_EXCEPTION)
8182 return -1;
8183 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008184 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 return -1;
8186 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 }
8188 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 *inpos = collendpos;
8190 break;
8191 default:
8192 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008193 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008197 if (PyBytes_Check(repunicode)) {
8198 /* Directly copy bytes result to output. */
8199 Py_ssize_t outsize = PyBytes_Size(*res);
8200 Py_ssize_t requiredsize;
8201 repsize = PyBytes_Size(repunicode);
8202 requiredsize = *respos + repsize;
8203 if (requiredsize > outsize)
8204 /* Make room for all additional bytes. */
8205 if (charmapencode_resize(res, respos, requiredsize)) {
8206 Py_DECREF(repunicode);
8207 return -1;
8208 }
8209 memcpy(PyBytes_AsString(*res) + *respos,
8210 PyBytes_AsString(repunicode), repsize);
8211 *respos += repsize;
8212 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008213 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008214 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008215 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008216 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008217 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008218 Py_DECREF(repunicode);
8219 return -1;
8220 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008221 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008222 data = PyUnicode_DATA(repunicode);
8223 kind = PyUnicode_KIND(repunicode);
8224 for (index = 0; index < repsize; index++) {
8225 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8226 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008228 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 return -1;
8230 }
8231 else if (x==enc_FAILED) {
8232 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008233 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 return -1;
8235 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008236 }
8237 *inpos = newpos;
8238 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 }
8240 return 0;
8241}
8242
Alexander Belopolsky40018472011-02-26 01:02:56 +00008243PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244_PyUnicode_EncodeCharmap(PyObject *unicode,
8245 PyObject *mapping,
8246 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 /* output object */
8249 PyObject *res = NULL;
8250 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008251 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008252 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008254 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255 PyObject *errorHandler = NULL;
8256 PyObject *exc = NULL;
8257 /* the following variable is used for caching string comparisons
8258 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8259 * 3=ignore, 4=xmlcharrefreplace */
8260 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008261 void *data;
8262 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Benjamin Petersonbac79492012-01-14 13:34:47 -05008264 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008265 return NULL;
8266 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008267 data = PyUnicode_DATA(unicode);
8268 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008269
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 /* Default to Latin-1 */
8271 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008272 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 /* allocate enough for a simple encoding without
8275 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 if (res == NULL)
8278 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008279 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008283 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008285 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 if (x==enc_EXCEPTION) /* error */
8287 goto onError;
8288 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 &exc,
8291 &known_errorHandler, &errorHandler, errors,
8292 &res, &respos)) {
8293 goto onError;
8294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 else
8297 /* done with this character => adjust input position */
8298 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008303 if (_PyBytes_Resize(&res, respos) < 0)
8304 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 Py_XDECREF(exc);
8307 Py_XDECREF(errorHandler);
8308 return res;
8309
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 Py_XDECREF(res);
8312 Py_XDECREF(exc);
8313 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 return NULL;
8315}
8316
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008317/* Deprecated */
8318PyObject *
8319PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8320 Py_ssize_t size,
8321 PyObject *mapping,
8322 const char *errors)
8323{
8324 PyObject *result;
8325 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8326 if (unicode == NULL)
8327 return NULL;
8328 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8329 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008330 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331}
8332
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333PyObject *
8334PyUnicode_AsCharmapString(PyObject *unicode,
8335 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336{
8337 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 PyErr_BadArgument();
8339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008341 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342}
8343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345static void
8346make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348 Py_ssize_t startpos, Py_ssize_t endpos,
8349 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 *exceptionObject = _PyUnicodeTranslateError_Create(
8353 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8357 goto onError;
8358 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8359 goto onError;
8360 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8361 goto onError;
8362 return;
8363 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008364 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 }
8366}
8367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368/* error handling callback helper:
8369 build arguments, call the callback and check the arguments,
8370 put the result into newpos and return the replacement string, which
8371 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008372static PyObject *
8373unicode_translate_call_errorhandler(const char *errors,
8374 PyObject **errorHandler,
8375 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008377 Py_ssize_t startpos, Py_ssize_t endpos,
8378 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008380 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008382 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 PyObject *restuple;
8384 PyObject *resunicode;
8385
8386 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 }
8391
8392 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396
8397 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008402 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 Py_DECREF(restuple);
8404 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 }
8406 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 &resunicode, &i_newpos)) {
8408 Py_DECREF(restuple);
8409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008411 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008413 else
8414 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008416 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 Py_DECREF(restuple);
8418 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008419 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 Py_INCREF(resunicode);
8421 Py_DECREF(restuple);
8422 return resunicode;
8423}
8424
8425/* Lookup the character ch in the mapping and put the result in result,
8426 which must be decrefed by the caller.
8427 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008428static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430{
Christian Heimes217cfd12007-12-02 14:31:20 +00008431 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 PyObject *x;
8433
8434 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 x = PyObject_GetItem(mapping, w);
8437 Py_DECREF(w);
8438 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8440 /* No mapping found means: use 1:1 mapping. */
8441 PyErr_Clear();
8442 *result = NULL;
8443 return 0;
8444 } else
8445 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 }
8447 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 *result = x;
8449 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008451 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008453 if (value < 0 || value > MAX_UNICODE) {
8454 PyErr_Format(PyExc_ValueError,
8455 "character mapping must be in range(0x%x)",
8456 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 Py_DECREF(x);
8458 return -1;
8459 }
8460 *result = x;
8461 return 0;
8462 }
8463 else if (PyUnicode_Check(x)) {
8464 *result = x;
8465 return 0;
8466 }
8467 else {
8468 /* wrong return value */
8469 PyErr_SetString(PyExc_TypeError,
8470 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 Py_DECREF(x);
8472 return -1;
8473 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474}
Victor Stinner1194ea02014-04-04 19:37:40 +02008475
8476/* lookup the character, write the result into the writer.
8477 Return 1 if the result was written into the writer, return 0 if the mapping
8478 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008479static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008480charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8481 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482{
Victor Stinner1194ea02014-04-04 19:37:40 +02008483 PyObject *item;
8484
8485 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008487
8488 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008490 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008493 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008494 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008495
8496 if (item == Py_None) {
8497 Py_DECREF(item);
8498 return 0;
8499 }
8500
8501 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008502 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8503 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8504 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008505 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8506 Py_DECREF(item);
8507 return -1;
8508 }
8509 Py_DECREF(item);
8510 return 1;
8511 }
8512
8513 if (!PyUnicode_Check(item)) {
8514 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008516 }
8517
8518 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8519 Py_DECREF(item);
8520 return -1;
8521 }
8522
8523 Py_DECREF(item);
8524 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525}
8526
Victor Stinner89a76ab2014-04-05 11:44:04 +02008527static int
8528unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8529 Py_UCS1 *translate)
8530{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008531 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008532 int ret = 0;
8533
Victor Stinner89a76ab2014-04-05 11:44:04 +02008534 if (charmaptranslate_lookup(ch, mapping, &item)) {
8535 return -1;
8536 }
8537
8538 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008539 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008540 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008541 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008542 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008543 /* not found => default to 1:1 mapping */
8544 translate[ch] = ch;
8545 return 1;
8546 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008547 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008548 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008549 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8550 used it */
8551 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008552 /* invalid character or character outside ASCII:
8553 skip the fast translate */
8554 goto exit;
8555 }
8556 translate[ch] = (Py_UCS1)replace;
8557 }
8558 else if (PyUnicode_Check(item)) {
8559 Py_UCS4 replace;
8560
8561 if (PyUnicode_READY(item) == -1) {
8562 Py_DECREF(item);
8563 return -1;
8564 }
8565 if (PyUnicode_GET_LENGTH(item) != 1)
8566 goto exit;
8567
8568 replace = PyUnicode_READ_CHAR(item, 0);
8569 if (replace > 127)
8570 goto exit;
8571 translate[ch] = (Py_UCS1)replace;
8572 }
8573 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008574 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008575 goto exit;
8576 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008577 ret = 1;
8578
Benjamin Peterson1365de72014-04-07 20:15:41 -04008579 exit:
8580 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008581 return ret;
8582}
8583
8584/* Fast path for ascii => ascii translation. Return 1 if the whole string
8585 was translated into writer, return 0 if the input string was partially
8586 translated into writer, raise an exception and return -1 on error. */
8587static int
8588unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008589 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008590{
Victor Stinner872b2912014-04-05 14:27:07 +02008591 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008592 Py_ssize_t len;
8593 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008594 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008595
8596 if (PyUnicode_READY(input) == -1)
8597 return -1;
8598 if (!PyUnicode_IS_ASCII(input))
8599 return 0;
8600 len = PyUnicode_GET_LENGTH(input);
8601
Victor Stinner872b2912014-04-05 14:27:07 +02008602 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008603
8604 in = PyUnicode_1BYTE_DATA(input);
8605 end = in + len;
8606
8607 assert(PyUnicode_IS_ASCII(writer->buffer));
8608 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8609 out = PyUnicode_1BYTE_DATA(writer->buffer);
8610
Victor Stinner872b2912014-04-05 14:27:07 +02008611 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008612 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008613 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008614 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008615 int translate = unicode_fast_translate_lookup(mapping, ch,
8616 ascii_table);
8617 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008618 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008619 if (translate == 0)
8620 goto exit;
8621 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008622 }
Victor Stinner872b2912014-04-05 14:27:07 +02008623 if (ch2 == 0xfe) {
8624 if (ignore)
8625 continue;
8626 goto exit;
8627 }
8628 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008629 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008630 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008631 }
Victor Stinner872b2912014-04-05 14:27:07 +02008632 res = 1;
8633
8634exit:
8635 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8636 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008637}
8638
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640_PyUnicode_TranslateCharmap(PyObject *input,
8641 PyObject *mapping,
8642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008645 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 Py_ssize_t size, i;
8647 int kind;
8648 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008649 _PyUnicodeWriter writer;
8650 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 char *reason = "character maps to <undefined>";
8652 PyObject *errorHandler = NULL;
8653 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008654 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008655 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 PyErr_BadArgument();
8659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 if (PyUnicode_READY(input) == -1)
8663 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008664 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 kind = PyUnicode_KIND(input);
8666 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667
8668 if (size == 0) {
8669 Py_INCREF(input);
8670 return input;
8671 }
8672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 /* allocate enough for a simple 1:1 translation without
8674 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008675 _PyUnicodeWriter_Init(&writer);
8676 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678
Victor Stinner872b2912014-04-05 14:27:07 +02008679 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8680
8681 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008682 if (res < 0) {
8683 _PyUnicodeWriter_Dealloc(&writer);
8684 return NULL;
8685 }
8686 if (res == 1)
8687 return _PyUnicodeWriter_Finish(&writer);
8688
Victor Stinner89a76ab2014-04-05 11:44:04 +02008689 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008692 int translate;
8693 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8694 Py_ssize_t newpos;
8695 /* startpos for collecting untranslatable chars */
8696 Py_ssize_t collstart;
8697 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008698 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699
Victor Stinner1194ea02014-04-04 19:37:40 +02008700 ch = PyUnicode_READ(kind, data, i);
8701 translate = charmaptranslate_output(ch, mapping, &writer);
8702 if (translate < 0)
8703 goto onError;
8704
8705 if (translate != 0) {
8706 /* it worked => adjust input pointer */
8707 ++i;
8708 continue;
8709 }
8710
8711 /* untranslatable character */
8712 collstart = i;
8713 collend = i+1;
8714
8715 /* find all untranslatable characters */
8716 while (collend < size) {
8717 PyObject *x;
8718 ch = PyUnicode_READ(kind, data, collend);
8719 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008720 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008721 Py_XDECREF(x);
8722 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008724 ++collend;
8725 }
8726
8727 if (ignore) {
8728 i = collend;
8729 }
8730 else {
8731 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8732 reason, input, &exc,
8733 collstart, collend, &newpos);
8734 if (repunicode == NULL)
8735 goto onError;
8736 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008738 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008739 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008740 Py_DECREF(repunicode);
8741 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008742 }
8743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 Py_XDECREF(exc);
8745 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008746 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008749 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 Py_XDECREF(exc);
8751 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 return NULL;
8753}
8754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755/* Deprecated. Use PyUnicode_Translate instead. */
8756PyObject *
8757PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8758 Py_ssize_t size,
8759 PyObject *mapping,
8760 const char *errors)
8761{
Christian Heimes5f520f42012-09-11 14:03:25 +02008762 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8764 if (!unicode)
8765 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008766 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8767 Py_DECREF(unicode);
8768 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769}
8770
Alexander Belopolsky40018472011-02-26 01:02:56 +00008771PyObject *
8772PyUnicode_Translate(PyObject *str,
8773 PyObject *mapping,
8774 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775{
8776 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008777
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 str = PyUnicode_FromObject(str);
8779 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008780 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 Py_DECREF(str);
8783 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784}
Tim Petersced69f82003-09-16 20:30:58 +00008785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008787fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788{
8789 /* No need to call PyUnicode_READY(self) because this function is only
8790 called as a callback from fixup() which does it already. */
8791 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8792 const int kind = PyUnicode_KIND(self);
8793 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008794 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008795 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 Py_ssize_t i;
8797
8798 for (i = 0; i < len; ++i) {
8799 ch = PyUnicode_READ(kind, data, i);
8800 fixed = 0;
8801 if (ch > 127) {
8802 if (Py_UNICODE_ISSPACE(ch))
8803 fixed = ' ';
8804 else {
8805 const int decimal = Py_UNICODE_TODECIMAL(ch);
8806 if (decimal >= 0)
8807 fixed = '0' + decimal;
8808 }
8809 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008810 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008811 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 PyUnicode_WRITE(kind, data, i, fixed);
8813 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008814 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008815 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 }
8818
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008819 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820}
8821
8822PyObject *
8823_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8824{
8825 if (!PyUnicode_Check(unicode)) {
8826 PyErr_BadInternalCall();
8827 return NULL;
8828 }
8829 if (PyUnicode_READY(unicode) == -1)
8830 return NULL;
8831 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8832 /* If the string is already ASCII, just return the same string */
8833 Py_INCREF(unicode);
8834 return unicode;
8835 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008836 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837}
8838
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008839PyObject *
8840PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8841 Py_ssize_t length)
8842{
Victor Stinnerf0124502011-11-21 23:12:56 +01008843 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008844 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008845 Py_UCS4 maxchar;
8846 enum PyUnicode_Kind kind;
8847 void *data;
8848
Victor Stinner99d7ad02012-02-22 13:37:39 +01008849 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008850 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008851 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008852 if (ch > 127) {
8853 int decimal = Py_UNICODE_TODECIMAL(ch);
8854 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008855 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008856 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008857 }
8858 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008859
8860 /* Copy to a new string */
8861 decimal = PyUnicode_New(length, maxchar);
8862 if (decimal == NULL)
8863 return decimal;
8864 kind = PyUnicode_KIND(decimal);
8865 data = PyUnicode_DATA(decimal);
8866 /* Iterate over code points */
8867 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008868 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008869 if (ch > 127) {
8870 int decimal = Py_UNICODE_TODECIMAL(ch);
8871 if (decimal >= 0)
8872 ch = '0' + decimal;
8873 }
8874 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008876 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008877}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008878/* --- Decimal Encoder ---------------------------------------------------- */
8879
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880int
8881PyUnicode_EncodeDecimal(Py_UNICODE *s,
8882 Py_ssize_t length,
8883 char *output,
8884 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008885{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008886 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008887 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008888 enum PyUnicode_Kind kind;
8889 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008890
8891 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 PyErr_BadArgument();
8893 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008894 }
8895
Victor Stinner42bf7752011-11-21 22:52:58 +01008896 unicode = PyUnicode_FromUnicode(s, length);
8897 if (unicode == NULL)
8898 return -1;
8899
Benjamin Petersonbac79492012-01-14 13:34:47 -05008900 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008901 Py_DECREF(unicode);
8902 return -1;
8903 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008904 kind = PyUnicode_KIND(unicode);
8905 data = PyUnicode_DATA(unicode);
8906
Victor Stinnerb84d7232011-11-22 01:50:07 +01008907 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008908 PyObject *exc;
8909 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008911 Py_ssize_t startpos;
8912
8913 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008914
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008916 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008917 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 decimal = Py_UNICODE_TODECIMAL(ch);
8921 if (decimal >= 0) {
8922 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008923 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 continue;
8925 }
8926 if (0 < ch && ch < 256) {
8927 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008928 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 continue;
8930 }
Victor Stinner6345be92011-11-25 20:09:01 +01008931
Victor Stinner42bf7752011-11-21 22:52:58 +01008932 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008933 exc = NULL;
8934 raise_encode_exception(&exc, "decimal", unicode,
8935 startpos, startpos+1,
8936 "invalid decimal Unicode string");
8937 Py_XDECREF(exc);
8938 Py_DECREF(unicode);
8939 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008940 }
8941 /* 0-terminate the output string */
8942 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008943 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008944 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008945}
8946
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947/* --- Helpers ------------------------------------------------------------ */
8948
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`, and resolves negative `start`/`end`
   relative to `len`, flooring them at 0.  `start` and `end` must be
   modifiable lvalues; `len` may be any expression but is evaluated more than
   once, so it must not have side effects.

   Every argument is parenthesized in the expansion so that an expression
   argument (e.g. `cond ? a : b`) keeps its meaning.  The expansion is a
   plain if/else-if statement followed by an if statement, not a single
   statement: do not use it as the unbraced body of an if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
8963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008965any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 Py_ssize_t start,
8967 Py_ssize_t end)
8968{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008969 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 void *buf1, *buf2;
8971 Py_ssize_t len1, len2, result;
8972
8973 kind1 = PyUnicode_KIND(s1);
8974 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008975 if (kind1 < kind2)
8976 return -1;
8977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 len1 = PyUnicode_GET_LENGTH(s1);
8979 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008980 ADJUST_INDICES(start, end, len1);
8981 if (end - start < len2)
8982 return -1;
8983
8984 buf1 = PyUnicode_DATA(s1);
8985 buf2 = PyUnicode_DATA(s2);
8986 if (len2 == 1) {
8987 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8988 result = findchar((const char *)buf1 + kind1*start,
8989 kind1, end - start, ch, direction);
8990 if (result == -1)
8991 return -1;
8992 else
8993 return start + result;
8994 }
8995
8996 if (kind2 != kind1) {
8997 buf2 = _PyUnicode_AsKind(s2, kind1);
8998 if (!buf2)
8999 return -2;
9000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001
Victor Stinner794d5672011-10-10 03:21:36 +02009002 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009003 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009004 case PyUnicode_1BYTE_KIND:
9005 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9006 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9007 else
9008 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9009 break;
9010 case PyUnicode_2BYTE_KIND:
9011 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9012 break;
9013 case PyUnicode_4BYTE_KIND:
9014 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9015 break;
9016 default:
9017 assert(0); result = -2;
9018 }
9019 }
9020 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009021 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009022 case PyUnicode_1BYTE_KIND:
9023 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9024 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9025 else
9026 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9027 break;
9028 case PyUnicode_2BYTE_KIND:
9029 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9030 break;
9031 case PyUnicode_4BYTE_KIND:
9032 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9033 break;
9034 default:
9035 assert(0); result = -2;
9036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037 }
9038
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009039 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 PyMem_Free(buf2);
9041
9042 return result;
9043}
9044
9045Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009046_PyUnicode_InsertThousandsGrouping(
9047 PyObject *unicode, Py_ssize_t index,
9048 Py_ssize_t n_buffer,
9049 void *digits, Py_ssize_t n_digits,
9050 Py_ssize_t min_width,
9051 const char *grouping, PyObject *thousands_sep,
9052 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053{
Victor Stinner41a863c2012-02-24 00:37:51 +01009054 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009055 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009056 Py_ssize_t thousands_sep_len;
9057 Py_ssize_t len;
9058
9059 if (unicode != NULL) {
9060 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009061 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009062 }
9063 else {
9064 kind = PyUnicode_1BYTE_KIND;
9065 data = NULL;
9066 }
9067 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9068 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9069 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9070 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009071 if (thousands_sep_kind < kind) {
9072 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9073 if (!thousands_sep_data)
9074 return -1;
9075 }
9076 else {
9077 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9078 if (!data)
9079 return -1;
9080 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 }
9082
Benjamin Petersonead6b532011-12-20 17:23:42 -06009083 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009085 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009086 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009087 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009088 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009089 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009090 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009091 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009092 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009094 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009098 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009100 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009104 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009105 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009106 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009107 break;
9108 default:
9109 assert(0);
9110 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009112 if (unicode != NULL && thousands_sep_kind != kind) {
9113 if (thousands_sep_kind < kind)
9114 PyMem_Free(thousands_sep_data);
9115 else
9116 PyMem_Free(data);
9117 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009118 if (unicode == NULL) {
9119 *maxchar = 127;
9120 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009121 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009122 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009123 }
9124 }
9125 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126}
9127
9128
Alexander Belopolsky40018472011-02-26 01:02:56 +00009129Py_ssize_t
9130PyUnicode_Count(PyObject *str,
9131 PyObject *substr,
9132 Py_ssize_t start,
9133 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009135 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009136 PyObject* str_obj;
9137 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009138 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 void *buf1 = NULL, *buf2 = NULL;
9140 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009141
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009142 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009143 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009145 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009146 if (!sub_obj) {
9147 Py_DECREF(str_obj);
9148 return -1;
9149 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009150 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009151 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 Py_DECREF(str_obj);
9153 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 }
Tim Petersced69f82003-09-16 20:30:58 +00009155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 kind1 = PyUnicode_KIND(str_obj);
9157 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009158 if (kind1 < kind2) {
9159 Py_DECREF(sub_obj);
9160 Py_DECREF(str_obj);
9161 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009162 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 len1 = PyUnicode_GET_LENGTH(str_obj);
9165 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009167 if (end - start < len2) {
9168 Py_DECREF(sub_obj);
9169 Py_DECREF(str_obj);
9170 return 0;
9171 }
9172
9173 buf1 = PyUnicode_DATA(str_obj);
9174 buf2 = PyUnicode_DATA(sub_obj);
9175 if (kind2 != kind1) {
9176 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9177 if (!buf2)
9178 goto onError;
9179 }
9180
9181 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009183 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9184 result = asciilib_count(
9185 ((Py_UCS1*)buf1) + start, end - start,
9186 buf2, len2, PY_SSIZE_T_MAX
9187 );
9188 else
9189 result = ucs1lib_count(
9190 ((Py_UCS1*)buf1) + start, end - start,
9191 buf2, len2, PY_SSIZE_T_MAX
9192 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 break;
9194 case PyUnicode_2BYTE_KIND:
9195 result = ucs2lib_count(
9196 ((Py_UCS2*)buf1) + start, end - start,
9197 buf2, len2, PY_SSIZE_T_MAX
9198 );
9199 break;
9200 case PyUnicode_4BYTE_KIND:
9201 result = ucs4lib_count(
9202 ((Py_UCS4*)buf1) + start, end - start,
9203 buf2, len2, PY_SSIZE_T_MAX
9204 );
9205 break;
9206 default:
9207 assert(0); result = 0;
9208 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009209
9210 Py_DECREF(sub_obj);
9211 Py_DECREF(str_obj);
9212
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009213 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 PyMem_Free(buf2);
9215
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 onError:
9218 Py_DECREF(sub_obj);
9219 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009220 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 PyMem_Free(buf2);
9222 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223}
9224
Alexander Belopolsky40018472011-02-26 01:02:56 +00009225Py_ssize_t
9226PyUnicode_Find(PyObject *str,
9227 PyObject *sub,
9228 Py_ssize_t start,
9229 Py_ssize_t end,
9230 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009232 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009235 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009237 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009238 if (!sub) {
9239 Py_DECREF(str);
9240 return -2;
9241 }
9242 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9243 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 Py_DECREF(str);
9245 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 }
Tim Petersced69f82003-09-16 20:30:58 +00009247
Victor Stinner794d5672011-10-10 03:21:36 +02009248 result = any_find_slice(direction,
9249 str, sub, start, end
9250 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009251
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009253 Py_DECREF(sub);
9254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 return result;
9256}
9257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258Py_ssize_t
9259PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9260 Py_ssize_t start, Py_ssize_t end,
9261 int direction)
9262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009264 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 if (PyUnicode_READY(str) == -1)
9266 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009267 if (start < 0 || end < 0) {
9268 PyErr_SetString(PyExc_IndexError, "string index out of range");
9269 return -2;
9270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 if (end > PyUnicode_GET_LENGTH(str))
9272 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009273 if (start >= end)
9274 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009276 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9277 kind, end-start, ch, direction);
9278 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009280 else
9281 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282}
9283
Alexander Belopolsky40018472011-02-26 01:02:56 +00009284static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009285tailmatch(PyObject *self,
9286 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009287 Py_ssize_t start,
9288 Py_ssize_t end,
9289 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 int kind_self;
9292 int kind_sub;
9293 void *data_self;
9294 void *data_sub;
9295 Py_ssize_t offset;
9296 Py_ssize_t i;
9297 Py_ssize_t end_sub;
9298
9299 if (PyUnicode_READY(self) == -1 ||
9300 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009301 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302
9303 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 return 1;
9305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9307 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009309 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 kind_self = PyUnicode_KIND(self);
9312 data_self = PyUnicode_DATA(self);
9313 kind_sub = PyUnicode_KIND(substring);
9314 data_sub = PyUnicode_DATA(substring);
9315 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9316
9317 if (direction > 0)
9318 offset = end;
9319 else
9320 offset = start;
9321
9322 if (PyUnicode_READ(kind_self, data_self, offset) ==
9323 PyUnicode_READ(kind_sub, data_sub, 0) &&
9324 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9325 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9326 /* If both are of the same kind, memcmp is sufficient */
9327 if (kind_self == kind_sub) {
9328 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009329 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 data_sub,
9331 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009332 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 }
9334 /* otherwise we have to compare each character by first accesing it */
9335 else {
9336 /* We do not need to compare 0 and len(substring)-1 because
9337 the if statement above ensured already that they are equal
9338 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 for (i = 1; i < end_sub; ++i) {
9340 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9341 PyUnicode_READ(kind_sub, data_sub, i))
9342 return 0;
9343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 }
9347
9348 return 0;
9349}
9350
Alexander Belopolsky40018472011-02-26 01:02:56 +00009351Py_ssize_t
9352PyUnicode_Tailmatch(PyObject *str,
9353 PyObject *substr,
9354 Py_ssize_t start,
9355 Py_ssize_t end,
9356 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009358 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009359
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 str = PyUnicode_FromObject(str);
9361 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 substr = PyUnicode_FromObject(substr);
9364 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 Py_DECREF(str);
9366 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 }
Tim Petersced69f82003-09-16 20:30:58 +00009368
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009369 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 Py_DECREF(str);
9372 Py_DECREF(substr);
9373 return result;
9374}
9375
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376/* Apply fixfct filter to the Unicode object self and return a
9377 reference to the modified object */
9378
Alexander Belopolsky40018472011-02-26 01:02:56 +00009379static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009380fixup(PyObject *self,
9381 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 PyObject *u;
9384 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009385 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009387 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009390 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 /* fix functions return the new maximum character in a string,
9393 if the kind of the resulting unicode object does not change,
9394 everything is fine. Otherwise we need to change the string kind
9395 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009396 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009397
9398 if (maxchar_new == 0) {
9399 /* no changes */;
9400 if (PyUnicode_CheckExact(self)) {
9401 Py_DECREF(u);
9402 Py_INCREF(self);
9403 return self;
9404 }
9405 else
9406 return u;
9407 }
9408
Victor Stinnere6abb482012-05-02 01:15:40 +02009409 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410
Victor Stinnereaab6042011-12-11 22:22:39 +01009411 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009413
9414 /* In case the maximum character changed, we need to
9415 convert the string to the new category. */
9416 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9417 if (v == NULL) {
9418 Py_DECREF(u);
9419 return NULL;
9420 }
9421 if (maxchar_new > maxchar_old) {
9422 /* If the maxchar increased so that the kind changed, not all
9423 characters are representable anymore and we need to fix the
9424 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009425 _PyUnicode_FastCopyCharacters(v, 0,
9426 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009427 maxchar_old = fixfct(v);
9428 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 }
9430 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009431 _PyUnicode_FastCopyCharacters(v, 0,
9432 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009434 Py_DECREF(u);
9435 assert(_PyUnicode_CheckConsistency(v, 1));
9436 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437}
9438
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009439static PyObject *
9440ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009442 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9443 char *resdata, *data = PyUnicode_DATA(self);
9444 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009445
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446 res = PyUnicode_New(len, 127);
9447 if (res == NULL)
9448 return NULL;
9449 resdata = PyUnicode_DATA(res);
9450 if (lower)
9451 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009453 _Py_bytes_upper(resdata, data, len);
9454 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455}
9456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009458handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460 Py_ssize_t j;
9461 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009462 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009463 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009464
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9466
9467 where ! is a negation and \p{xxx} is a character with property xxx.
9468 */
9469 for (j = i - 1; j >= 0; j--) {
9470 c = PyUnicode_READ(kind, data, j);
9471 if (!_PyUnicode_IsCaseIgnorable(c))
9472 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009474 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9475 if (final_sigma) {
9476 for (j = i + 1; j < length; j++) {
9477 c = PyUnicode_READ(kind, data, j);
9478 if (!_PyUnicode_IsCaseIgnorable(c))
9479 break;
9480 }
9481 final_sigma = j == length || !_PyUnicode_IsCased(c);
9482 }
9483 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484}
9485
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009486static int
9487lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9488 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009490 /* Obscure special case. */
9491 if (c == 0x3A3) {
9492 mapped[0] = handle_capital_sigma(kind, data, length, i);
9493 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009495 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496}
9497
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009498static Py_ssize_t
9499do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501 Py_ssize_t i, k = 0;
9502 int n_res, j;
9503 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009504
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505 c = PyUnicode_READ(kind, data, 0);
9506 n_res = _PyUnicode_ToUpperFull(c, mapped);
9507 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009508 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009509 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009511 for (i = 1; i < length; i++) {
9512 c = PyUnicode_READ(kind, data, i);
9513 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9514 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009515 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009517 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009518 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520}
9521
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009522static Py_ssize_t
9523do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9524 Py_ssize_t i, k = 0;
9525
9526 for (i = 0; i < length; i++) {
9527 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9528 int n_res, j;
9529 if (Py_UNICODE_ISUPPER(c)) {
9530 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9531 }
9532 else if (Py_UNICODE_ISLOWER(c)) {
9533 n_res = _PyUnicode_ToUpperFull(c, mapped);
9534 }
9535 else {
9536 n_res = 1;
9537 mapped[0] = c;
9538 }
9539 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009540 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009541 res[k++] = mapped[j];
9542 }
9543 }
9544 return k;
9545}
9546
9547static Py_ssize_t
9548do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9549 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009551 Py_ssize_t i, k = 0;
9552
9553 for (i = 0; i < length; i++) {
9554 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9555 int n_res, j;
9556 if (lower)
9557 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9558 else
9559 n_res = _PyUnicode_ToUpperFull(c, mapped);
9560 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009561 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009562 res[k++] = mapped[j];
9563 }
9564 }
9565 return k;
9566}
9567
9568static Py_ssize_t
9569do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9570{
9571 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9572}
9573
9574static Py_ssize_t
9575do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9576{
9577 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9578}
9579
Benjamin Petersone51757f2012-01-12 21:10:29 -05009580static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009581do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9582{
9583 Py_ssize_t i, k = 0;
9584
9585 for (i = 0; i < length; i++) {
9586 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9587 Py_UCS4 mapped[3];
9588 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9589 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009590 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009591 res[k++] = mapped[j];
9592 }
9593 }
9594 return k;
9595}
9596
9597static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009598do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9599{
9600 Py_ssize_t i, k = 0;
9601 int previous_is_cased;
9602
9603 previous_is_cased = 0;
9604 for (i = 0; i < length; i++) {
9605 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9606 Py_UCS4 mapped[3];
9607 int n_res, j;
9608
9609 if (previous_is_cased)
9610 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9611 else
9612 n_res = _PyUnicode_ToTitleFull(c, mapped);
9613
9614 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009615 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009616 res[k++] = mapped[j];
9617 }
9618
9619 previous_is_cased = _PyUnicode_IsCased(c);
9620 }
9621 return k;
9622}
9623
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624static PyObject *
9625case_operation(PyObject *self,
9626 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9627{
9628 PyObject *res = NULL;
9629 Py_ssize_t length, newlength = 0;
9630 int kind, outkind;
9631 void *data, *outdata;
9632 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9633
Benjamin Petersoneea48462012-01-16 14:28:50 -05009634 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635
9636 kind = PyUnicode_KIND(self);
9637 data = PyUnicode_DATA(self);
9638 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009639 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009640 PyErr_SetString(PyExc_OverflowError, "string is too long");
9641 return NULL;
9642 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009643 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 if (tmp == NULL)
9645 return PyErr_NoMemory();
9646 newlength = perform(kind, data, length, tmp, &maxchar);
9647 res = PyUnicode_New(newlength, maxchar);
9648 if (res == NULL)
9649 goto leave;
9650 tmpend = tmp + newlength;
9651 outdata = PyUnicode_DATA(res);
9652 outkind = PyUnicode_KIND(res);
9653 switch (outkind) {
9654 case PyUnicode_1BYTE_KIND:
9655 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9656 break;
9657 case PyUnicode_2BYTE_KIND:
9658 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9659 break;
9660 case PyUnicode_4BYTE_KIND:
9661 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9662 break;
9663 default:
9664 assert(0);
9665 break;
9666 }
9667 leave:
9668 PyMem_FREE(tmp);
9669 return res;
9670}
9671
Tim Peters8ce9f162004-08-27 01:49:32 +00009672PyObject *
9673PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009676 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009678 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009679 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9680 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009681 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009683 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009685 int use_memcpy;
9686 unsigned char *res_data = NULL, *sep_data = NULL;
9687 PyObject *last_obj;
9688 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009690 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009691 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009692 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009693 }
9694
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009695 /* NOTE: the following code can't call back into Python code,
9696 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009697 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009698
Tim Peters05eba1f2004-08-27 21:32:02 +00009699 seqlen = PySequence_Fast_GET_SIZE(fseq);
9700 /* If empty sequence, return u"". */
9701 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009702 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009703 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009704 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009705
Tim Peters05eba1f2004-08-27 21:32:02 +00009706 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009707 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009708 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009709 if (seqlen == 1) {
9710 if (PyUnicode_CheckExact(items[0])) {
9711 res = items[0];
9712 Py_INCREF(res);
9713 Py_DECREF(fseq);
9714 return res;
9715 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009716 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009717 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009718 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009720 /* Set up sep and seplen */
9721 if (separator == NULL) {
9722 /* fall back to a blank space separator */
9723 sep = PyUnicode_FromOrdinal(' ');
9724 if (!sep)
9725 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009726 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009727 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009728 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009729 else {
9730 if (!PyUnicode_Check(separator)) {
9731 PyErr_Format(PyExc_TypeError,
9732 "separator: expected str instance,"
9733 " %.80s found",
9734 Py_TYPE(separator)->tp_name);
9735 goto onError;
9736 }
9737 if (PyUnicode_READY(separator))
9738 goto onError;
9739 sep = separator;
9740 seplen = PyUnicode_GET_LENGTH(separator);
9741 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9742 /* inc refcount to keep this code path symmetric with the
9743 above case of a blank separator */
9744 Py_INCREF(sep);
9745 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009746 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009747 }
9748
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009749 /* There are at least two things to join, or else we have a subclass
9750 * of str in the sequence.
9751 * Do a pre-pass to figure out the total amount of space we'll
9752 * need (sz), and see whether all argument are strings.
9753 */
9754 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009755#ifdef Py_DEBUG
9756 use_memcpy = 0;
9757#else
9758 use_memcpy = 1;
9759#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009760 for (i = 0; i < seqlen; i++) {
9761 const Py_ssize_t old_sz = sz;
9762 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 if (!PyUnicode_Check(item)) {
9764 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009765 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009766 " %.80s found",
9767 i, Py_TYPE(item)->tp_name);
9768 goto onError;
9769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 if (PyUnicode_READY(item) == -1)
9771 goto onError;
9772 sz += PyUnicode_GET_LENGTH(item);
9773 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009774 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009775 if (i != 0)
9776 sz += seplen;
9777 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9778 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009779 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009780 goto onError;
9781 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009782 if (use_memcpy && last_obj != NULL) {
9783 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9784 use_memcpy = 0;
9785 }
9786 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009787 }
Tim Petersced69f82003-09-16 20:30:58 +00009788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009790 if (res == NULL)
9791 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009792
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009793 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009794#ifdef Py_DEBUG
9795 use_memcpy = 0;
9796#else
9797 if (use_memcpy) {
9798 res_data = PyUnicode_1BYTE_DATA(res);
9799 kind = PyUnicode_KIND(res);
9800 if (seplen != 0)
9801 sep_data = PyUnicode_1BYTE_DATA(sep);
9802 }
9803#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009804 if (use_memcpy) {
9805 for (i = 0; i < seqlen; ++i) {
9806 Py_ssize_t itemlen;
9807 item = items[i];
9808
9809 /* Copy item, and maybe the separator. */
9810 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009811 Py_MEMCPY(res_data,
9812 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009813 kind * seplen);
9814 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009815 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009816
9817 itemlen = PyUnicode_GET_LENGTH(item);
9818 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009819 Py_MEMCPY(res_data,
9820 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009821 kind * itemlen);
9822 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009823 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009824 }
9825 assert(res_data == PyUnicode_1BYTE_DATA(res)
9826 + kind * PyUnicode_GET_LENGTH(res));
9827 }
9828 else {
9829 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9830 Py_ssize_t itemlen;
9831 item = items[i];
9832
9833 /* Copy item, and maybe the separator. */
9834 if (i && seplen != 0) {
9835 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9836 res_offset += seplen;
9837 }
9838
9839 itemlen = PyUnicode_GET_LENGTH(item);
9840 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009841 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009842 res_offset += itemlen;
9843 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009844 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009845 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009846 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009847
Tim Peters05eba1f2004-08-27 21:32:02 +00009848 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009850 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009854 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009856 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857 return NULL;
9858}
9859
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the buffer `data`,
 * starting at code-unit index `start`.  `kind` selects the code-unit type:
 * unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND
 * and Py_UCS4 for PyUnicode_4BYTE_KIND.  PyUnicode_WCHAR_KIND and any other
 * value trip an assertion.
 *
 * No bounds checking is done; the caller must guarantee that
 * [start, start + length) lies inside the buffer.
 *
 * Every macro argument is parenthesized at its point of use so that
 * compound expressions (e.g. `k & mask`, `a + b`) keep their meaning.
 * `value` and `length` may still be evaluated more than once, so they must
 * not have side effects.  The locals `i_` and `to_` are reserved by the
 * macro; do not pass expressions that use those names.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9883
Victor Stinnerd3f08822012-05-29 12:57:52 +02009884void
9885_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9886 Py_UCS4 fill_char)
9887{
9888 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9889 const void *data = PyUnicode_DATA(unicode);
9890 assert(PyUnicode_IS_READY(unicode));
9891 assert(unicode_modifiable(unicode));
9892 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9893 assert(start >= 0);
9894 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9895 FILL(kind, data, fill_char, start, length);
9896}
9897
Victor Stinner3fe55312012-01-04 00:33:50 +01009898Py_ssize_t
9899PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9900 Py_UCS4 fill_char)
9901{
9902 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009903
9904 if (!PyUnicode_Check(unicode)) {
9905 PyErr_BadInternalCall();
9906 return -1;
9907 }
9908 if (PyUnicode_READY(unicode) == -1)
9909 return -1;
9910 if (unicode_check_modifiable(unicode))
9911 return -1;
9912
Victor Stinnerd3f08822012-05-29 12:57:52 +02009913 if (start < 0) {
9914 PyErr_SetString(PyExc_IndexError, "string index out of range");
9915 return -1;
9916 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009917 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9918 PyErr_SetString(PyExc_ValueError,
9919 "fill character is bigger than "
9920 "the string maximum character");
9921 return -1;
9922 }
9923
9924 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9925 length = Py_MIN(maxlen, length);
9926 if (length <= 0)
9927 return 0;
9928
Victor Stinnerd3f08822012-05-29 12:57:52 +02009929 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009930 return length;
9931}
9932
Victor Stinner9310abb2011-10-05 00:59:23 +02009933static PyObject *
9934pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009935 Py_ssize_t left,
9936 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 PyObject *u;
9940 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009941 int kind;
9942 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943
9944 if (left < 0)
9945 left = 0;
9946 if (right < 0)
9947 right = 0;
9948
Victor Stinnerc4b49542011-12-11 22:44:26 +01009949 if (left == 0 && right == 0)
9950 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9953 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009954 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9955 return NULL;
9956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009958 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009960 if (!u)
9961 return NULL;
9962
9963 kind = PyUnicode_KIND(u);
9964 data = PyUnicode_DATA(u);
9965 if (left)
9966 FILL(kind, data, fill, 0, left);
9967 if (right)
9968 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009969 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009970 assert(_PyUnicode_CheckConsistency(u, 1));
9971 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972}
9973
Alexander Belopolsky40018472011-02-26 01:02:56 +00009974PyObject *
9975PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978
9979 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009980 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009981 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009982 if (PyUnicode_READY(string) == -1) {
9983 Py_DECREF(string);
9984 return NULL;
9985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986
Benjamin Petersonead6b532011-12-20 17:23:42 -06009987 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 if (PyUnicode_IS_ASCII(string))
9990 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009991 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009992 PyUnicode_GET_LENGTH(string), keepends);
9993 else
9994 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009995 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009996 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 break;
9998 case PyUnicode_2BYTE_KIND:
9999 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010000 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 PyUnicode_GET_LENGTH(string), keepends);
10002 break;
10003 case PyUnicode_4BYTE_KIND:
10004 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010005 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 PyUnicode_GET_LENGTH(string), keepends);
10007 break;
10008 default:
10009 assert(0);
10010 list = 0;
10011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 Py_DECREF(string);
10013 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014}
10015
Alexander Belopolsky40018472011-02-26 01:02:56 +000010016static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010017split(PyObject *self,
10018 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010019 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010021 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 void *buf1, *buf2;
10023 Py_ssize_t len1, len2;
10024 PyObject* out;
10025
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010027 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (PyUnicode_READY(self) == -1)
10030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010033 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010035 if (PyUnicode_IS_ASCII(self))
10036 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010037 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010038 PyUnicode_GET_LENGTH(self), maxcount
10039 );
10040 else
10041 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010042 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043 PyUnicode_GET_LENGTH(self), maxcount
10044 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 case PyUnicode_2BYTE_KIND:
10046 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010047 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 PyUnicode_GET_LENGTH(self), maxcount
10049 );
10050 case PyUnicode_4BYTE_KIND:
10051 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010052 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 PyUnicode_GET_LENGTH(self), maxcount
10054 );
10055 default:
10056 assert(0);
10057 return NULL;
10058 }
10059
10060 if (PyUnicode_READY(substring) == -1)
10061 return NULL;
10062
10063 kind1 = PyUnicode_KIND(self);
10064 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 len1 = PyUnicode_GET_LENGTH(self);
10066 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010067 if (kind1 < kind2 || len1 < len2) {
10068 out = PyList_New(1);
10069 if (out == NULL)
10070 return NULL;
10071 Py_INCREF(self);
10072 PyList_SET_ITEM(out, 0, self);
10073 return out;
10074 }
10075 buf1 = PyUnicode_DATA(self);
10076 buf2 = PyUnicode_DATA(substring);
10077 if (kind2 != kind1) {
10078 buf2 = _PyUnicode_AsKind(substring, kind1);
10079 if (!buf2)
10080 return NULL;
10081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010083 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010085 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10086 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010087 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010088 else
10089 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 break;
10092 case PyUnicode_2BYTE_KIND:
10093 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010094 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 break;
10096 case PyUnicode_4BYTE_KIND:
10097 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010098 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 break;
10100 default:
10101 out = NULL;
10102 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010103 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 PyMem_Free(buf2);
10105 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106}
10107
Alexander Belopolsky40018472011-02-26 01:02:56 +000010108static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010109rsplit(PyObject *self,
10110 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010111 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010112{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010113 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 void *buf1, *buf2;
10115 Py_ssize_t len1, len2;
10116 PyObject* out;
10117
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010118 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010119 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 if (PyUnicode_READY(self) == -1)
10122 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010125 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010127 if (PyUnicode_IS_ASCII(self))
10128 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010129 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010130 PyUnicode_GET_LENGTH(self), maxcount
10131 );
10132 else
10133 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010134 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010135 PyUnicode_GET_LENGTH(self), maxcount
10136 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 case PyUnicode_2BYTE_KIND:
10138 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010139 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 PyUnicode_GET_LENGTH(self), maxcount
10141 );
10142 case PyUnicode_4BYTE_KIND:
10143 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010144 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 PyUnicode_GET_LENGTH(self), maxcount
10146 );
10147 default:
10148 assert(0);
10149 return NULL;
10150 }
10151
10152 if (PyUnicode_READY(substring) == -1)
10153 return NULL;
10154
10155 kind1 = PyUnicode_KIND(self);
10156 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 len1 = PyUnicode_GET_LENGTH(self);
10158 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010159 if (kind1 < kind2 || len1 < len2) {
10160 out = PyList_New(1);
10161 if (out == NULL)
10162 return NULL;
10163 Py_INCREF(self);
10164 PyList_SET_ITEM(out, 0, self);
10165 return out;
10166 }
10167 buf1 = PyUnicode_DATA(self);
10168 buf2 = PyUnicode_DATA(substring);
10169 if (kind2 != kind1) {
10170 buf2 = _PyUnicode_AsKind(substring, kind1);
10171 if (!buf2)
10172 return NULL;
10173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010175 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010177 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10178 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180 else
10181 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 break;
10184 case PyUnicode_2BYTE_KIND:
10185 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 break;
10188 case PyUnicode_4BYTE_KIND:
10189 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 break;
10192 default:
10193 out = NULL;
10194 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010195 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 PyMem_Free(buf2);
10197 return out;
10198}
10199
10200static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10202 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010204 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10207 return asciilib_find(buf1, len1, buf2, len2, offset);
10208 else
10209 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 case PyUnicode_2BYTE_KIND:
10211 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10212 case PyUnicode_4BYTE_KIND:
10213 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10214 }
10215 assert(0);
10216 return -1;
10217}
10218
10219static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010220anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10221 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010223 switch (kind) {
10224 case PyUnicode_1BYTE_KIND:
10225 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10226 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10227 else
10228 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10229 case PyUnicode_2BYTE_KIND:
10230 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10231 case PyUnicode_4BYTE_KIND:
10232 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10233 }
10234 assert(0);
10235 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010236}
10237
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010238static void
10239replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10240 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10241{
10242 int kind = PyUnicode_KIND(u);
10243 void *data = PyUnicode_DATA(u);
10244 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10245 if (kind == PyUnicode_1BYTE_KIND) {
10246 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10247 (Py_UCS1 *)data + len,
10248 u1, u2, maxcount);
10249 }
10250 else if (kind == PyUnicode_2BYTE_KIND) {
10251 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10252 (Py_UCS2 *)data + len,
10253 u1, u2, maxcount);
10254 }
10255 else {
10256 assert(kind == PyUnicode_4BYTE_KIND);
10257 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10258 (Py_UCS4 *)data + len,
10259 u1, u2, maxcount);
10260 }
10261}
10262
Alexander Belopolsky40018472011-02-26 01:02:56 +000010263static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264replace(PyObject *self, PyObject *str1,
10265 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 PyObject *u;
10268 char *sbuf = PyUnicode_DATA(self);
10269 char *buf1 = PyUnicode_DATA(str1);
10270 char *buf2 = PyUnicode_DATA(str2);
10271 int srelease = 0, release1 = 0, release2 = 0;
10272 int skind = PyUnicode_KIND(self);
10273 int kind1 = PyUnicode_KIND(str1);
10274 int kind2 = PyUnicode_KIND(str2);
10275 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10276 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10277 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010278 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010279 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280
10281 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010282 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010284 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285
Victor Stinner59de0ee2011-10-07 10:01:28 +020010286 if (str1 == str2)
10287 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288
Victor Stinner49a0a212011-10-12 23:46:10 +020010289 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010290 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10291 if (maxchar < maxchar_str1)
10292 /* substring too wide to be present */
10293 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010294 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10295 /* Replacing str1 with str2 may cause a maxchar reduction in the
10296 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010297 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010298 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010303 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010306 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010307 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010308
Victor Stinner69ed0f42013-04-09 21:48:24 +020010309 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010310 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010311 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010313 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010317
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010318 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10319 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010320 }
10321 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 int rkind = skind;
10323 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010324 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (kind1 < rkind) {
10327 /* widen substring */
10328 buf1 = _PyUnicode_AsKind(str1, rkind);
10329 if (!buf1) goto error;
10330 release1 = 1;
10331 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010333 if (i < 0)
10334 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (rkind > kind2) {
10336 /* widen replacement */
10337 buf2 = _PyUnicode_AsKind(str2, rkind);
10338 if (!buf2) goto error;
10339 release2 = 1;
10340 }
10341 else if (rkind < kind2) {
10342 /* widen self and buf1 */
10343 rkind = kind2;
10344 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010345 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 sbuf = _PyUnicode_AsKind(self, rkind);
10347 if (!sbuf) goto error;
10348 srelease = 1;
10349 buf1 = _PyUnicode_AsKind(str1, rkind);
10350 if (!buf1) goto error;
10351 release1 = 1;
10352 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010353 u = PyUnicode_New(slen, maxchar);
10354 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010356 assert(PyUnicode_KIND(u) == rkind);
10357 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010358
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010359 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010360 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010365
10366 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010368 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010370 if (i == -1)
10371 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010372 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010374 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010378 }
10379 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010381 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 int rkind = skind;
10383 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010386 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 buf1 = _PyUnicode_AsKind(str1, rkind);
10388 if (!buf1) goto error;
10389 release1 = 1;
10390 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010392 if (n == 0)
10393 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010395 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 buf2 = _PyUnicode_AsKind(str2, rkind);
10397 if (!buf2) goto error;
10398 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010401 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 rkind = kind2;
10403 sbuf = _PyUnicode_AsKind(self, rkind);
10404 if (!sbuf) goto error;
10405 srelease = 1;
10406 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010407 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 buf1 = _PyUnicode_AsKind(str1, rkind);
10409 if (!buf1) goto error;
10410 release1 = 1;
10411 }
10412 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10413 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010414 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 PyErr_SetString(PyExc_OverflowError,
10416 "replace string is too long");
10417 goto error;
10418 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010419 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010420 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010421 _Py_INCREF_UNICODE_EMPTY();
10422 if (!unicode_empty)
10423 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010424 u = unicode_empty;
10425 goto done;
10426 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010427 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 PyErr_SetString(PyExc_OverflowError,
10429 "replace string is too long");
10430 goto error;
10431 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010432 u = PyUnicode_New(new_size, maxchar);
10433 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010435 assert(PyUnicode_KIND(u) == rkind);
10436 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 ires = i = 0;
10438 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439 while (n-- > 0) {
10440 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010441 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010442 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010443 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010444 if (j == -1)
10445 break;
10446 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010447 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010448 memcpy(res + rkind * ires,
10449 sbuf + rkind * i,
10450 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 }
10453 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010464 memcpy(res + rkind * ires,
10465 sbuf + rkind * i,
10466 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010467 }
10468 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469 /* interleave */
10470 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010471 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010473 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010475 if (--n <= 0)
10476 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010477 memcpy(res + rkind * ires,
10478 sbuf + rkind * i,
10479 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 ires++;
10481 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010482 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010483 memcpy(res + rkind * ires,
10484 sbuf + rkind * i,
10485 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010487 }
10488
10489 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010490 unicode_adjust_maxchar(&u);
10491 if (u == NULL)
10492 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010494
10495 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 if (srelease)
10497 PyMem_FREE(sbuf);
10498 if (release1)
10499 PyMem_FREE(buf1);
10500 if (release2)
10501 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010502 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010506 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 if (srelease)
10508 PyMem_FREE(sbuf);
10509 if (release1)
10510 PyMem_FREE(buf1);
10511 if (release2)
10512 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010513 return unicode_result_unchanged(self);
10514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 error:
10516 if (srelease && sbuf)
10517 PyMem_FREE(sbuf);
10518 if (release1 && buf1)
10519 PyMem_FREE(buf1);
10520 if (release2 && buf2)
10521 PyMem_FREE(buf2);
10522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523}
10524
10525/* --- Unicode Object Methods --------------------------------------------- */
10526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010527PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529\n\
10530Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010531characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
10533static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010534unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010536 if (PyUnicode_READY(self) == -1)
10537 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010538 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539}
10540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010541PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010542 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543\n\
10544Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010545have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546
10547static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010548unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010550 if (PyUnicode_READY(self) == -1)
10551 return NULL;
10552 if (PyUnicode_GET_LENGTH(self) == 0)
10553 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010554 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555}
10556
Benjamin Petersond5890c82012-01-14 13:23:30 -050010557PyDoc_STRVAR(casefold__doc__,
10558 "S.casefold() -> str\n\
10559\n\
10560Return a version of S suitable for caseless comparisons.");
10561
10562static PyObject *
10563unicode_casefold(PyObject *self)
10564{
10565 if (PyUnicode_READY(self) == -1)
10566 return NULL;
10567 if (PyUnicode_IS_ASCII(self))
10568 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010569 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010570}
10571
10572
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010573/* Argument converter. Coerces to a single unicode character */
10574
10575static int
10576convert_uc(PyObject *obj, void *addr)
10577{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010580
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 uniobj = PyUnicode_FromObject(obj);
10582 if (uniobj == NULL) {
10583 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010584 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 return 0;
10586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010590 Py_DECREF(uniobj);
10591 return 0;
10592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010594 Py_DECREF(uniobj);
10595 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010596}
10597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010598PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010601Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010602done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
10604static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010605unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010607 Py_ssize_t marg, left;
10608 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 Py_UCS4 fillchar = ' ';
10610
Victor Stinnere9a29352011-10-01 02:14:59 +020010611 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
Benjamin Petersonbac79492012-01-14 13:34:47 -050010614 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 return NULL;
10616
Victor Stinnerc4b49542011-12-11 22:44:26 +010010617 if (PyUnicode_GET_LENGTH(self) >= width)
10618 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619
Victor Stinnerc4b49542011-12-11 22:44:26 +010010620 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 left = marg / 2 + (marg & width & 1);
10622
Victor Stinner9310abb2011-10-05 00:59:23 +020010623 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624}
10625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626/* This function assumes that str1 and str2 are readied by the caller. */
10627
Marc-André Lemburge5034372000-08-08 08:04:29 +000010628static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010629unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010630{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010631#define COMPARE(TYPE1, TYPE2) \
10632 do { \
10633 TYPE1* p1 = (TYPE1 *)data1; \
10634 TYPE2* p2 = (TYPE2 *)data2; \
10635 TYPE1* end = p1 + len; \
10636 Py_UCS4 c1, c2; \
10637 for (; p1 != end; p1++, p2++) { \
10638 c1 = *p1; \
10639 c2 = *p2; \
10640 if (c1 != c2) \
10641 return (c1 < c2) ? -1 : 1; \
10642 } \
10643 } \
10644 while (0)
10645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 int kind1, kind2;
10647 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010648 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 kind1 = PyUnicode_KIND(str1);
10651 kind2 = PyUnicode_KIND(str2);
10652 data1 = PyUnicode_DATA(str1);
10653 data2 = PyUnicode_DATA(str2);
10654 len1 = PyUnicode_GET_LENGTH(str1);
10655 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010656 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010657
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010658 switch(kind1) {
10659 case PyUnicode_1BYTE_KIND:
10660 {
10661 switch(kind2) {
10662 case PyUnicode_1BYTE_KIND:
10663 {
10664 int cmp = memcmp(data1, data2, len);
10665 /* normalize result of memcmp() into the range [-1; 1] */
10666 if (cmp < 0)
10667 return -1;
10668 if (cmp > 0)
10669 return 1;
10670 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010671 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010672 case PyUnicode_2BYTE_KIND:
10673 COMPARE(Py_UCS1, Py_UCS2);
10674 break;
10675 case PyUnicode_4BYTE_KIND:
10676 COMPARE(Py_UCS1, Py_UCS4);
10677 break;
10678 default:
10679 assert(0);
10680 }
10681 break;
10682 }
10683 case PyUnicode_2BYTE_KIND:
10684 {
10685 switch(kind2) {
10686 case PyUnicode_1BYTE_KIND:
10687 COMPARE(Py_UCS2, Py_UCS1);
10688 break;
10689 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010690 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010691 COMPARE(Py_UCS2, Py_UCS2);
10692 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010693 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010694 case PyUnicode_4BYTE_KIND:
10695 COMPARE(Py_UCS2, Py_UCS4);
10696 break;
10697 default:
10698 assert(0);
10699 }
10700 break;
10701 }
10702 case PyUnicode_4BYTE_KIND:
10703 {
10704 switch(kind2) {
10705 case PyUnicode_1BYTE_KIND:
10706 COMPARE(Py_UCS4, Py_UCS1);
10707 break;
10708 case PyUnicode_2BYTE_KIND:
10709 COMPARE(Py_UCS4, Py_UCS2);
10710 break;
10711 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010712 {
10713#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10714 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10715 /* normalize result of wmemcmp() into the range [-1; 1] */
10716 if (cmp < 0)
10717 return -1;
10718 if (cmp > 0)
10719 return 1;
10720#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010721 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010722#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010723 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010724 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010725 default:
10726 assert(0);
10727 }
10728 break;
10729 }
10730 default:
10731 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010732 }
10733
Victor Stinner770e19e2012-10-04 22:59:45 +020010734 if (len1 == len2)
10735 return 0;
10736 if (len1 < len2)
10737 return -1;
10738 else
10739 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010740
10741#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010742}
10743
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010744Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010745unicode_compare_eq(PyObject *str1, PyObject *str2)
10746{
10747 int kind;
10748 void *data1, *data2;
10749 Py_ssize_t len;
10750 int cmp;
10751
Victor Stinnere5567ad2012-10-23 02:48:49 +020010752 len = PyUnicode_GET_LENGTH(str1);
10753 if (PyUnicode_GET_LENGTH(str2) != len)
10754 return 0;
10755 kind = PyUnicode_KIND(str1);
10756 if (PyUnicode_KIND(str2) != kind)
10757 return 0;
10758 data1 = PyUnicode_DATA(str1);
10759 data2 = PyUnicode_DATA(str2);
10760
10761 cmp = memcmp(data1, data2, len * kind);
10762 return (cmp == 0);
10763}
10764
10765
Alexander Belopolsky40018472011-02-26 01:02:56 +000010766int
10767PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10770 if (PyUnicode_READY(left) == -1 ||
10771 PyUnicode_READY(right) == -1)
10772 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010773
10774 /* a string is equal to itself */
10775 if (left == right)
10776 return 0;
10777
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010778 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010780 PyErr_Format(PyExc_TypeError,
10781 "Can't compare %.100s and %.100s",
10782 left->ob_type->tp_name,
10783 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784 return -1;
10785}
10786
Martin v. Löwis5b222132007-06-10 09:51:05 +000010787int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010788_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10789{
10790 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10791 if (right_str == NULL)
10792 return -1;
10793 return PyUnicode_Compare(left, right_str);
10794}
10795
10796int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010797PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_ssize_t i;
10800 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 Py_UCS4 chr;
10802
Victor Stinner910337b2011-10-03 03:20:16 +020010803 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 if (PyUnicode_READY(uni) == -1)
10805 return -1;
10806 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010807 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010808 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010809 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010810 size_t len, len2 = strlen(str);
10811 int cmp;
10812
10813 len = Py_MIN(len1, len2);
10814 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010815 if (cmp != 0) {
10816 if (cmp < 0)
10817 return -1;
10818 else
10819 return 1;
10820 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010821 if (len1 > len2)
10822 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010823 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010824 return -1; /* str is longer */
10825 return 0;
10826 }
10827 else {
10828 void *data = PyUnicode_DATA(uni);
10829 /* Compare Unicode string and source character set string */
10830 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010831 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010832 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10833 /* This check keeps Python strings that end in '\0' from comparing equal
10834 to C strings identical up to that point. */
10835 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10836 return 1; /* uni is longer */
10837 if (str[i])
10838 return -1; /* str is longer */
10839 return 0;
10840 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010841}
10842
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010843
Benjamin Peterson29060642009-01-31 22:14:21 +000010844#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010845 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010846
Alexander Belopolsky40018472011-02-26 01:02:56 +000010847PyObject *
10848PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010849{
10850 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010851 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010852
Victor Stinnere5567ad2012-10-23 02:48:49 +020010853 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10854 Py_RETURN_NOTIMPLEMENTED;
10855
10856 if (PyUnicode_READY(left) == -1 ||
10857 PyUnicode_READY(right) == -1)
10858 return NULL;
10859
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010860 if (left == right) {
10861 switch (op) {
10862 case Py_EQ:
10863 case Py_LE:
10864 case Py_GE:
10865 /* a string is equal to itself */
10866 v = Py_True;
10867 break;
10868 case Py_NE:
10869 case Py_LT:
10870 case Py_GT:
10871 v = Py_False;
10872 break;
10873 default:
10874 PyErr_BadArgument();
10875 return NULL;
10876 }
10877 }
10878 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010879 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010880 result ^= (op == Py_NE);
10881 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010882 }
10883 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010884 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010885
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010886 /* Convert the return value to a Boolean */
10887 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010888 case Py_LE:
10889 v = TEST_COND(result <= 0);
10890 break;
10891 case Py_GE:
10892 v = TEST_COND(result >= 0);
10893 break;
10894 case Py_LT:
10895 v = TEST_COND(result == -1);
10896 break;
10897 case Py_GT:
10898 v = TEST_COND(result == 1);
10899 break;
10900 default:
10901 PyErr_BadArgument();
10902 return NULL;
10903 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010904 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010905 Py_INCREF(v);
10906 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010907}
10908
Alexander Belopolsky40018472011-02-26 01:02:56 +000010909int
10910PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010911{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010912 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010913 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 void *buf1, *buf2;
10915 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010916 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010917
10918 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 sub = PyUnicode_FromObject(element);
10920 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010921 PyErr_Format(PyExc_TypeError,
10922 "'in <string>' requires string as left operand, not %s",
10923 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010925 }
10926
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010928 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929 Py_DECREF(sub);
10930 return -1;
10931 }
10932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 kind1 = PyUnicode_KIND(str);
10934 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010935 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010937 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010938 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 }
10940 len1 = PyUnicode_GET_LENGTH(str);
10941 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010942 if (len1 < len2) {
10943 Py_DECREF(sub);
10944 Py_DECREF(str);
10945 return 0;
10946 }
10947 buf1 = PyUnicode_DATA(str);
10948 buf2 = PyUnicode_DATA(sub);
10949 if (len2 == 1) {
10950 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10951 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10952 Py_DECREF(sub);
10953 Py_DECREF(str);
10954 return result;
10955 }
10956 if (kind2 != kind1) {
10957 buf2 = _PyUnicode_AsKind(sub, kind1);
10958 if (!buf2) {
10959 Py_DECREF(sub);
10960 Py_DECREF(str);
10961 return -1;
10962 }
10963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964
Victor Stinner77282cb2013-04-14 19:22:47 +020010965 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 case PyUnicode_1BYTE_KIND:
10967 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10968 break;
10969 case PyUnicode_2BYTE_KIND:
10970 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10971 break;
10972 case PyUnicode_4BYTE_KIND:
10973 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10974 break;
10975 default:
10976 result = -1;
10977 assert(0);
10978 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979
10980 Py_DECREF(str);
10981 Py_DECREF(sub);
10982
Victor Stinner77282cb2013-04-14 19:22:47 +020010983 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 PyMem_Free(buf2);
10985
Guido van Rossum403d68b2000-03-13 15:55:09 +000010986 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010987}
10988
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989/* Concat to string or Unicode object giving a new Unicode object. */
10990
Alexander Belopolsky40018472011-02-26 01:02:56 +000010991PyObject *
10992PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010995 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010996 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997
10998 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011001 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
11006 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011007 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011011 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 }
11015
Victor Stinner488fa492011-12-12 00:01:39 +010011016 u_len = PyUnicode_GET_LENGTH(u);
11017 v_len = PyUnicode_GET_LENGTH(v);
11018 if (u_len > PY_SSIZE_T_MAX - v_len) {
11019 PyErr_SetString(PyExc_OverflowError,
11020 "strings are too large to concat");
11021 goto onError;
11022 }
11023 new_len = u_len + v_len;
11024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011026 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011027 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011030 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011033 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11034 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 Py_DECREF(u);
11036 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011037 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 Py_XDECREF(u);
11042 Py_XDECREF(v);
11043 return NULL;
11044}
11045
Walter Dörwald1ab83302007-05-18 17:15:44 +000011046void
Victor Stinner23e56682011-10-03 03:54:37 +020011047PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011048{
Victor Stinner23e56682011-10-03 03:54:37 +020011049 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011050 Py_UCS4 maxchar, maxchar2;
11051 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011052
11053 if (p_left == NULL) {
11054 if (!PyErr_Occurred())
11055 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011056 return;
11057 }
Victor Stinner23e56682011-10-03 03:54:37 +020011058 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011059 if (right == NULL || left == NULL
11060 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011061 if (!PyErr_Occurred())
11062 PyErr_BadInternalCall();
11063 goto error;
11064 }
11065
Benjamin Petersonbac79492012-01-14 13:34:47 -050011066 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011067 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011068 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011069 goto error;
11070
Victor Stinner488fa492011-12-12 00:01:39 +010011071 /* Shortcuts */
11072 if (left == unicode_empty) {
11073 Py_DECREF(left);
11074 Py_INCREF(right);
11075 *p_left = right;
11076 return;
11077 }
11078 if (right == unicode_empty)
11079 return;
11080
11081 left_len = PyUnicode_GET_LENGTH(left);
11082 right_len = PyUnicode_GET_LENGTH(right);
11083 if (left_len > PY_SSIZE_T_MAX - right_len) {
11084 PyErr_SetString(PyExc_OverflowError,
11085 "strings are too large to concat");
11086 goto error;
11087 }
11088 new_len = left_len + right_len;
11089
11090 if (unicode_modifiable(left)
11091 && PyUnicode_CheckExact(right)
11092 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011093 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11094 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011095 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011096 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011097 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11098 {
11099 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011100 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011101 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011102
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011103 /* copy 'right' into the newly allocated area of 'left' */
11104 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011105 }
Victor Stinner488fa492011-12-12 00:01:39 +010011106 else {
11107 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11108 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011109 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011110
Victor Stinner488fa492011-12-12 00:01:39 +010011111 /* Concat the two Unicode strings */
11112 res = PyUnicode_New(new_len, maxchar);
11113 if (res == NULL)
11114 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011115 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11116 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011117 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011118 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011119 }
11120 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011121 return;
11122
11123error:
Victor Stinner488fa492011-12-12 00:01:39 +010011124 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011125}
11126
11127void
11128PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11129{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011130 PyUnicode_Append(pleft, right);
11131 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011132}
11133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011134PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011137Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011138string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011139interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
11141static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011142unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011144 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011145 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011146 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011148 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 void *buf1, *buf2;
11150 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151
Jesus Ceaac451502011-04-20 17:09:23 +020011152 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11153 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 kind1 = PyUnicode_KIND(self);
11157 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011158 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011159 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011160 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 len1 = PyUnicode_GET_LENGTH(self);
11163 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011165 if (end - start < len2) {
11166 Py_DECREF(substring);
11167 return PyLong_FromLong(0);
11168 }
11169 buf1 = PyUnicode_DATA(self);
11170 buf2 = PyUnicode_DATA(substring);
11171 if (kind2 != kind1) {
11172 buf2 = _PyUnicode_AsKind(substring, kind1);
11173 if (!buf2) {
11174 Py_DECREF(substring);
11175 return NULL;
11176 }
11177 }
11178 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 case PyUnicode_1BYTE_KIND:
11180 iresult = ucs1lib_count(
11181 ((Py_UCS1*)buf1) + start, end - start,
11182 buf2, len2, PY_SSIZE_T_MAX
11183 );
11184 break;
11185 case PyUnicode_2BYTE_KIND:
11186 iresult = ucs2lib_count(
11187 ((Py_UCS2*)buf1) + start, end - start,
11188 buf2, len2, PY_SSIZE_T_MAX
11189 );
11190 break;
11191 case PyUnicode_4BYTE_KIND:
11192 iresult = ucs4lib_count(
11193 ((Py_UCS4*)buf1) + start, end - start,
11194 buf2, len2, PY_SSIZE_T_MAX
11195 );
11196 break;
11197 default:
11198 assert(0); iresult = 0;
11199 }
11200
11201 result = PyLong_FromSsize_t(iresult);
11202
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011203 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205
11206 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011207
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 return result;
11209}
11210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011212 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011214Encode S using the codec registered for encoding. Default encoding\n\
11215is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011216handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011217a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11218'xmlcharrefreplace' as well as any other name registered with\n\
11219codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
11221static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011222unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011224 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225 char *encoding = NULL;
11226 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011227
Benjamin Peterson308d6372009-09-18 21:42:35 +000011228 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11229 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011231 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011232}
11233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011234PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011235 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236\n\
11237Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011238If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
11240static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011241unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011243 Py_ssize_t i, j, line_pos, src_len, incr;
11244 Py_UCS4 ch;
11245 PyObject *u;
11246 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011247 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011249 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011250 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
Ezio Melotti745d54d2013-11-16 19:10:57 +020011252 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11253 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255
Antoine Pitrou22425222011-10-04 19:10:51 +020011256 if (PyUnicode_READY(self) == -1)
11257 return NULL;
11258
Thomas Wouters7e474022000-07-16 12:04:32 +000011259 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011260 src_len = PyUnicode_GET_LENGTH(self);
11261 i = j = line_pos = 0;
11262 kind = PyUnicode_KIND(self);
11263 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011264 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 for (; i < src_len; i++) {
11266 ch = PyUnicode_READ(kind, src_data, i);
11267 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011268 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 goto overflow;
11273 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011275 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 goto overflow;
11280 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 if (ch == '\n' || ch == '\r')
11283 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011286 if (!found)
11287 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011288
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 if (!u)
11292 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 for (; i < src_len; i++) {
11298 ch = PyUnicode_READ(kind, src_data, i);
11299 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 incr = tabsize - (line_pos % tabsize);
11302 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011303 FILL(kind, dest_data, ' ', j, incr);
11304 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011306 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 line_pos++;
11309 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011310 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 if (ch == '\n' || ch == '\r')
11312 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011314 }
11315 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011316 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011317
Antoine Pitroue71d5742011-10-04 15:55:09 +020011318 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011319 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321}
11322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011323PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325\n\
11326Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011327such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328arguments start and end are interpreted as in slice notation.\n\
11329\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
11332static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011335 /* initialize variables to prevent gcc warning */
11336 PyObject *substring = NULL;
11337 Py_ssize_t start = 0;
11338 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011339 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
Jesus Ceaac451502011-04-20 17:09:23 +020011341 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11342 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
Christian Heimesd47802e2013-06-29 21:33:36 +020011345 if (PyUnicode_READY(self) == -1) {
11346 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011348 }
11349 if (PyUnicode_READY(substring) == -1) {
11350 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353
Victor Stinner7931d9a2011-11-04 00:22:48 +010011354 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
11356 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 if (result == -2)
11359 return NULL;
11360
Christian Heimes217cfd12007-12-02 14:31:20 +000011361 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362}
11363
11364static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011365unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011367 void *data;
11368 enum PyUnicode_Kind kind;
11369 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011370
11371 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11372 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011374 }
11375 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11376 PyErr_SetString(PyExc_IndexError, "string index out of range");
11377 return NULL;
11378 }
11379 kind = PyUnicode_KIND(self);
11380 data = PyUnicode_DATA(self);
11381 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011382 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383}
11384
Guido van Rossumc2504932007-09-18 19:42:40 +000011385/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011386 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011387static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011388unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389{
Guido van Rossumc2504932007-09-18 19:42:40 +000011390 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011391 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011392
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011393#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011394 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011395#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 if (_PyUnicode_HASH(self) != -1)
11397 return _PyUnicode_HASH(self);
11398 if (PyUnicode_READY(self) == -1)
11399 return -1;
11400 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011401 /*
11402 We make the hash of the empty string be 0, rather than using
11403 (prefix ^ suffix), since this slightly obfuscates the hash secret
11404 */
11405 if (len == 0) {
11406 _PyUnicode_HASH(self) = 0;
11407 return 0;
11408 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011409 x = _Py_HashBytes(PyUnicode_DATA(self),
11410 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011412 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011423 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011424 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011425 PyObject *substring = NULL;
11426 Py_ssize_t start = 0;
11427 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
Jesus Ceaac451502011-04-20 17:09:23 +020011429 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11430 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
Christian Heimesd47a0452013-06-29 21:21:37 +020011433 if (PyUnicode_READY(self) == -1) {
11434 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011436 }
11437 if (PyUnicode_READY(substring) == -1) {
11438 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441
Victor Stinner7931d9a2011-11-04 00:22:48 +010011442 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 if (result == -2)
11447 return NULL;
11448
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 if (result < 0) {
11450 PyErr_SetString(PyExc_ValueError, "substring not found");
11451 return NULL;
11452 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011453
Christian Heimes217cfd12007-12-02 14:31:20 +000011454 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455}
11456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011460Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011464unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 Py_ssize_t i, length;
11467 int kind;
11468 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 int cased;
11470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 if (PyUnicode_READY(self) == -1)
11472 return NULL;
11473 length = PyUnicode_GET_LENGTH(self);
11474 kind = PyUnicode_KIND(self);
11475 data = PyUnicode_DATA(self);
11476
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 if (length == 1)
11479 return PyBool_FromLong(
11480 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 for (i = 0; i < length; i++) {
11488 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011489
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11491 return PyBool_FromLong(0);
11492 else if (!cased && Py_UNICODE_ISLOWER(ch))
11493 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011495 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496}
11497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011501Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011502at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
11504static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011505unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 Py_ssize_t i, length;
11508 int kind;
11509 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 int cased;
11511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 if (PyUnicode_READY(self) == -1)
11513 return NULL;
11514 length = PyUnicode_GET_LENGTH(self);
11515 kind = PyUnicode_KIND(self);
11516 data = PyUnicode_DATA(self);
11517
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 if (length == 1)
11520 return PyBool_FromLong(
11521 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011523 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011526
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 for (i = 0; i < length; i++) {
11529 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011530
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11532 return PyBool_FromLong(0);
11533 else if (!cased && Py_UNICODE_ISUPPER(ch))
11534 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011536 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537}
11538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011539PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011542Return True if S is a titlecased string and there is at least one\n\
11543character in S, i.e. upper- and titlecase characters may only\n\
11544follow uncased characters and lowercase characters only cased ones.\n\
11545Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
11547static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011548unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 Py_ssize_t i, length;
11551 int kind;
11552 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 int cased, previous_is_cased;
11554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 if (PyUnicode_READY(self) == -1)
11556 return NULL;
11557 length = PyUnicode_GET_LENGTH(self);
11558 kind = PyUnicode_KIND(self);
11559 data = PyUnicode_DATA(self);
11560
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 if (length == 1) {
11563 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11564 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11565 (Py_UNICODE_ISUPPER(ch) != 0));
11566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011568 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011571
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 cased = 0;
11573 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 for (i = 0; i < length; i++) {
11575 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011576
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11578 if (previous_is_cased)
11579 return PyBool_FromLong(0);
11580 previous_is_cased = 1;
11581 cased = 1;
11582 }
11583 else if (Py_UNICODE_ISLOWER(ch)) {
11584 if (!previous_is_cased)
11585 return PyBool_FromLong(0);
11586 previous_is_cased = 1;
11587 cased = 1;
11588 }
11589 else
11590 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011592 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593}
11594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011595PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011598Return True if all characters in S are whitespace\n\
11599and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
11601static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011602unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 Py_ssize_t i, length;
11605 int kind;
11606 void *data;
11607
11608 if (PyUnicode_READY(self) == -1)
11609 return NULL;
11610 length = PyUnicode_GET_LENGTH(self);
11611 kind = PyUnicode_KIND(self);
11612 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 if (length == 1)
11616 return PyBool_FromLong(
11617 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011619 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 for (i = 0; i < length; i++) {
11624 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011625 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011628 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629}
11630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011633\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011634Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011636
11637static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011638unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 Py_ssize_t i, length;
11641 int kind;
11642 void *data;
11643
11644 if (PyUnicode_READY(self) == -1)
11645 return NULL;
11646 length = PyUnicode_GET_LENGTH(self);
11647 kind = PyUnicode_KIND(self);
11648 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011650 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (length == 1)
11652 return PyBool_FromLong(
11653 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654
11655 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 for (i = 0; i < length; i++) {
11660 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011663 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664}
11665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011669Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011671
11672static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011673unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 int kind;
11676 void *data;
11677 Py_ssize_t len, i;
11678
11679 if (PyUnicode_READY(self) == -1)
11680 return NULL;
11681
11682 kind = PyUnicode_KIND(self);
11683 data = PyUnicode_DATA(self);
11684 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011685
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011686 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (len == 1) {
11688 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11689 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11690 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691
11692 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 for (i = 0; i < len; i++) {
11697 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011698 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011701 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011702}
11703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011704PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011707Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011708False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
11710static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011711unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 Py_ssize_t i, length;
11714 int kind;
11715 void *data;
11716
11717 if (PyUnicode_READY(self) == -1)
11718 return NULL;
11719 length = PyUnicode_GET_LENGTH(self);
11720 kind = PyUnicode_KIND(self);
11721 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 if (length == 1)
11725 return PyBool_FromLong(
11726 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011728 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 for (i = 0; i < length; i++) {
11733 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011736 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737}
11738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011739PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011742Return True if all characters in S are digits\n\
11743and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
11745static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011746unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 Py_ssize_t i, length;
11749 int kind;
11750 void *data;
11751
11752 if (PyUnicode_READY(self) == -1)
11753 return NULL;
11754 length = PyUnicode_GET_LENGTH(self);
11755 kind = PyUnicode_KIND(self);
11756 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 if (length == 1) {
11760 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11761 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011764 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 for (i = 0; i < length; i++) {
11769 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011772 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773}
11774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011775PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011778Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011779False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
11781static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011782unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 Py_ssize_t i, length;
11785 int kind;
11786 void *data;
11787
11788 if (PyUnicode_READY(self) == -1)
11789 return NULL;
11790 length = PyUnicode_GET_LENGTH(self);
11791 kind = PyUnicode_KIND(self);
11792 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (length == 1)
11796 return PyBool_FromLong(
11797 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011799 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 for (i = 0; i < length; i++) {
11804 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011807 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808}
11809
Martin v. Löwis47383402007-08-15 07:32:56 +000011810int
11811PyUnicode_IsIdentifier(PyObject *self)
11812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 int kind;
11814 void *data;
11815 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011816 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (PyUnicode_READY(self) == -1) {
11819 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 }
11822
11823 /* Special case for empty strings */
11824 if (PyUnicode_GET_LENGTH(self) == 0)
11825 return 0;
11826 kind = PyUnicode_KIND(self);
11827 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011828
11829 /* PEP 3131 says that the first character must be in
11830 XID_Start and subsequent characters in XID_Continue,
11831 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011832 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011833 letters, digits, underscore). However, given the current
11834 definition of XID_Start and XID_Continue, it is sufficient
11835 to check just for these, except that _ must be allowed
11836 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011838 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011839 return 0;
11840
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011841 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011844 return 1;
11845}
11846
11847PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011849\n\
11850Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011851to the language definition.\n\
11852\n\
11853Use keyword.iskeyword() to test for reserved identifiers\n\
11854such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011855
11856static PyObject*
11857unicode_isidentifier(PyObject *self)
11858{
11859 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11860}
11861
Georg Brandl559e5d72008-06-11 18:37:52 +000011862PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011864\n\
11865Return True if all characters in S are considered\n\
11866printable in repr() or S is empty, False otherwise.");
11867
11868static PyObject*
11869unicode_isprintable(PyObject *self)
11870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 Py_ssize_t i, length;
11872 int kind;
11873 void *data;
11874
11875 if (PyUnicode_READY(self) == -1)
11876 return NULL;
11877 length = PyUnicode_GET_LENGTH(self);
11878 kind = PyUnicode_KIND(self);
11879 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011880
11881 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (length == 1)
11883 return PyBool_FromLong(
11884 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 for (i = 0; i < length; i++) {
11887 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011888 Py_RETURN_FALSE;
11889 }
11890 }
11891 Py_RETURN_TRUE;
11892}
11893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011894PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011895 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896\n\
11897Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011898iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899
11900static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011901unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011903 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
Martin v. Löwis18e16552006-02-15 17:27:45 +000011906static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011907unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (PyUnicode_READY(self) == -1)
11910 return -1;
11911 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912}
11913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011914PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011917Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011918done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
11920static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011921unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011923 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 Py_UCS4 fillchar = ' ';
11925
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011926 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 return NULL;
11928
Benjamin Petersonbac79492012-01-14 13:34:47 -050011929 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Victor Stinnerc4b49542011-12-11 22:44:26 +010011932 if (PyUnicode_GET_LENGTH(self) >= width)
11933 return unicode_result_unchanged(self);
11934
11935 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936}
11937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011938PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011941Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
11943static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011944unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011946 if (PyUnicode_READY(self) == -1)
11947 return NULL;
11948 if (PyUnicode_IS_ASCII(self))
11949 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011950 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
/* Selectors for which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string past its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
11961
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011962/* externally visible for str.strip(unicode) */
11963PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011964_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 void *data;
11967 int kind;
11968 Py_ssize_t i, j, len;
11969 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011970 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11973 return NULL;
11974
11975 kind = PyUnicode_KIND(self);
11976 data = PyUnicode_DATA(self);
11977 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011978 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11980 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011981 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011982
Benjamin Peterson14339b62009-01-31 16:36:08 +000011983 i = 0;
11984 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011985 while (i < len) {
11986 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11987 if (!BLOOM(sepmask, ch))
11988 break;
11989 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11990 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 i++;
11992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011994
Benjamin Peterson14339b62009-01-31 16:36:08 +000011995 j = len;
11996 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011997 j--;
11998 while (j >= i) {
11999 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12000 if (!BLOOM(sepmask, ch))
12001 break;
12002 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12003 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012005 }
12006
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012008 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012009
Victor Stinner7931d9a2011-11-04 00:22:48 +010012010 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011}
12012
12013PyObject*
12014PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12015{
12016 unsigned char *data;
12017 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012018 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019
Victor Stinnerde636f32011-10-01 03:55:54 +020012020 if (PyUnicode_READY(self) == -1)
12021 return NULL;
12022
Victor Stinner684d5fd2012-05-03 02:32:34 +020012023 length = PyUnicode_GET_LENGTH(self);
12024 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012025
Victor Stinner684d5fd2012-05-03 02:32:34 +020012026 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012027 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028
Victor Stinnerde636f32011-10-01 03:55:54 +020012029 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012030 PyErr_SetString(PyExc_IndexError, "string index out of range");
12031 return NULL;
12032 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012033 if (start >= length || end < start)
12034 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012035
Victor Stinner684d5fd2012-05-03 02:32:34 +020012036 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012037 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012038 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012039 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012040 }
12041 else {
12042 kind = PyUnicode_KIND(self);
12043 data = PyUnicode_1BYTE_DATA(self);
12044 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012045 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012046 length);
12047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
12050static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012051do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 Py_ssize_t len, i, j;
12054
12055 if (PyUnicode_READY(self) == -1)
12056 return NULL;
12057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012059
Victor Stinnercc7af722013-04-09 22:39:24 +020012060 if (PyUnicode_IS_ASCII(self)) {
12061 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12062
12063 i = 0;
12064 if (striptype != RIGHTSTRIP) {
12065 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012066 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012067 if (!_Py_ascii_whitespace[ch])
12068 break;
12069 i++;
12070 }
12071 }
12072
12073 j = len;
12074 if (striptype != LEFTSTRIP) {
12075 j--;
12076 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012077 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012078 if (!_Py_ascii_whitespace[ch])
12079 break;
12080 j--;
12081 }
12082 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012083 }
12084 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012085 else {
12086 int kind = PyUnicode_KIND(self);
12087 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012088
Victor Stinnercc7af722013-04-09 22:39:24 +020012089 i = 0;
12090 if (striptype != RIGHTSTRIP) {
12091 while (i < len) {
12092 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12093 if (!Py_UNICODE_ISSPACE(ch))
12094 break;
12095 i++;
12096 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012097 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012098
12099 j = len;
12100 if (striptype != LEFTSTRIP) {
12101 j--;
12102 while (j >= i) {
12103 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12104 if (!Py_UNICODE_ISSPACE(ch))
12105 break;
12106 j--;
12107 }
12108 j++;
12109 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012110 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111
Victor Stinner7931d9a2011-11-04 00:22:48 +010012112 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115
12116static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012117do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012118{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012119 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120
Serhiy Storchakac6792272013-10-19 21:03:34 +030012121 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123
Benjamin Peterson14339b62009-01-31 16:36:08 +000012124 if (sep != NULL && sep != Py_None) {
12125 if (PyUnicode_Check(sep))
12126 return _PyUnicode_XStrip(self, striptype, sep);
12127 else {
12128 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 "%s arg must be None or str",
12130 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012131 return NULL;
12132 }
12133 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134
Benjamin Peterson14339b62009-01-31 16:36:08 +000012135 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136}
12137
12138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012139PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012141\n\
12142Return a copy of the string S with leading and trailing\n\
12143whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012144If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012145
12146static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012147unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 if (PyTuple_GET_SIZE(args) == 0)
12150 return do_strip(self, BOTHSTRIP); /* Common case */
12151 else
12152 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012153}
12154
12155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012156PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158\n\
12159Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012160If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161
12162static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 if (PyTuple_GET_SIZE(args) == 0)
12166 return do_strip(self, LEFTSTRIP); /* Common case */
12167 else
12168 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012169}
12170
12171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012172PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012174\n\
12175Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012176If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012177
12178static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012179unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012181 if (PyTuple_GET_SIZE(args) == 0)
12182 return do_strip(self, RIGHTSTRIP); /* Common case */
12183 else
12184 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012185}
12186
12187
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012189unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012191 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193
Serhiy Storchaka05997252013-01-26 12:14:02 +020012194 if (len < 1)
12195 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196
Victor Stinnerc4b49542011-12-11 22:44:26 +010012197 /* no repeat, return original string */
12198 if (len == 1)
12199 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012200
Benjamin Petersonbac79492012-01-14 13:34:47 -050012201 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 return NULL;
12203
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012204 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012205 PyErr_SetString(PyExc_OverflowError,
12206 "repeated string is too long");
12207 return NULL;
12208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012210
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012211 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212 if (!u)
12213 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012214 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (PyUnicode_GET_LENGTH(str) == 1) {
12217 const int kind = PyUnicode_KIND(str);
12218 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012219 if (kind == PyUnicode_1BYTE_KIND) {
12220 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012221 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012222 }
12223 else if (kind == PyUnicode_2BYTE_KIND) {
12224 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012225 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012226 ucs2[n] = fill_char;
12227 } else {
12228 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12229 assert(kind == PyUnicode_4BYTE_KIND);
12230 for (n = 0; n < len; ++n)
12231 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 }
12234 else {
12235 /* number of characters copied this far */
12236 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012237 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 char *to = (char *) PyUnicode_DATA(u);
12239 Py_MEMCPY(to, PyUnicode_DATA(str),
12240 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 n = (done <= nchars-done) ? done : nchars-done;
12243 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012244 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 }
12247
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012248 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012249 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250}
12251
Alexander Belopolsky40018472011-02-26 01:02:56 +000012252PyObject *
12253PyUnicode_Replace(PyObject *obj,
12254 PyObject *subobj,
12255 PyObject *replobj,
12256 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257{
12258 PyObject *self;
12259 PyObject *str1;
12260 PyObject *str2;
12261 PyObject *result;
12262
12263 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012264 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012267 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012268 Py_DECREF(self);
12269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 }
12271 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012272 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 Py_DECREF(self);
12274 Py_DECREF(str1);
12275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012277 if (PyUnicode_READY(self) == -1 ||
12278 PyUnicode_READY(str1) == -1 ||
12279 PyUnicode_READY(str2) == -1)
12280 result = NULL;
12281 else
12282 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 Py_DECREF(self);
12284 Py_DECREF(str1);
12285 Py_DECREF(str2);
12286 return result;
12287}
12288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012289PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012290 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291\n\
12292Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012293old replaced by new. If the optional argument count is\n\
12294given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295
12296static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 PyObject *str1;
12300 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012301 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302 PyObject *result;
12303
Martin v. Löwis18e16552006-02-15 17:27:45 +000012304 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012306 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012309 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 return NULL;
12311 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012312 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 Py_DECREF(str1);
12314 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012315 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012316 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12317 result = NULL;
12318 else
12319 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
12321 Py_DECREF(str1);
12322 Py_DECREF(str2);
12323 return result;
12324}
12325
Alexander Belopolsky40018472011-02-26 01:02:56 +000012326static PyObject *
12327unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012329 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 Py_ssize_t isize;
12331 Py_ssize_t osize, squote, dquote, i, o;
12332 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012333 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012337 return NULL;
12338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 isize = PyUnicode_GET_LENGTH(unicode);
12340 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 /* Compute length of output, quote characters, and
12343 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012344 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 max = 127;
12346 squote = dquote = 0;
12347 ikind = PyUnicode_KIND(unicode);
12348 for (i = 0; i < isize; i++) {
12349 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012350 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 case '\'': squote++; break;
12353 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012355 incr = 2;
12356 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 default:
12358 /* Fast-path ASCII */
12359 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012360 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012362 ;
12363 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012366 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012368 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012370 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012372 if (osize > PY_SSIZE_T_MAX - incr) {
12373 PyErr_SetString(PyExc_OverflowError,
12374 "string is too long to generate repr");
12375 return NULL;
12376 }
12377 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 }
12379
12380 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012381 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012383 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (dquote)
12385 /* Both squote and dquote present. Use squote,
12386 and escape them */
12387 osize += squote;
12388 else
12389 quote = '"';
12390 }
Victor Stinner55c08782013-04-14 18:45:39 +020012391 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392
12393 repr = PyUnicode_New(osize, max);
12394 if (repr == NULL)
12395 return NULL;
12396 okind = PyUnicode_KIND(repr);
12397 odata = PyUnicode_DATA(repr);
12398
12399 PyUnicode_WRITE(okind, odata, 0, quote);
12400 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012401 if (unchanged) {
12402 _PyUnicode_FastCopyCharacters(repr, 1,
12403 unicode, 0,
12404 isize);
12405 }
12406 else {
12407 for (i = 0, o = 1; i < isize; i++) {
12408 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409
Victor Stinner55c08782013-04-14 18:45:39 +020012410 /* Escape quotes and backslashes */
12411 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012412 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012414 continue;
12415 }
12416
12417 /* Map special whitespace to '\t', \n', '\r' */
12418 if (ch == '\t') {
12419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, 't');
12421 }
12422 else if (ch == '\n') {
12423 PyUnicode_WRITE(okind, odata, o++, '\\');
12424 PyUnicode_WRITE(okind, odata, o++, 'n');
12425 }
12426 else if (ch == '\r') {
12427 PyUnicode_WRITE(okind, odata, o++, '\\');
12428 PyUnicode_WRITE(okind, odata, o++, 'r');
12429 }
12430
12431 /* Map non-printable US ASCII to '\xhh' */
12432 else if (ch < ' ' || ch == 0x7F) {
12433 PyUnicode_WRITE(okind, odata, o++, '\\');
12434 PyUnicode_WRITE(okind, odata, o++, 'x');
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12437 }
12438
12439 /* Copy ASCII characters as-is */
12440 else if (ch < 0x7F) {
12441 PyUnicode_WRITE(okind, odata, o++, ch);
12442 }
12443
12444 /* Non-ASCII characters */
12445 else {
12446 /* Map Unicode whitespace and control characters
12447 (categories Z* and C* except ASCII space)
12448 */
12449 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12450 PyUnicode_WRITE(okind, odata, o++, '\\');
12451 /* Map 8-bit characters to '\xhh' */
12452 if (ch <= 0xff) {
12453 PyUnicode_WRITE(okind, odata, o++, 'x');
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12456 }
12457 /* Map 16-bit characters to '\uxxxx' */
12458 else if (ch <= 0xffff) {
12459 PyUnicode_WRITE(okind, odata, o++, 'u');
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12464 }
12465 /* Map 21-bit characters to '\U00xxxxxx' */
12466 else {
12467 PyUnicode_WRITE(okind, odata, o++, 'U');
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12476 }
12477 }
12478 /* Copy characters as-is */
12479 else {
12480 PyUnicode_WRITE(okind, odata, o++, ch);
12481 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012482 }
12483 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012486 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012487 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488}
12489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012490PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492\n\
12493Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012494such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495arguments start and end are interpreted as in slice notation.\n\
12496\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
12499static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012502 /* initialize variables to prevent gcc warning */
12503 PyObject *substring = NULL;
12504 Py_ssize_t start = 0;
12505 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
Jesus Ceaac451502011-04-20 17:09:23 +020012508 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12509 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
Christian Heimesea71a522013-06-29 21:17:34 +020012512 if (PyUnicode_READY(self) == -1) {
12513 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012515 }
12516 if (PyUnicode_READY(substring) == -1) {
12517 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520
Victor Stinner7931d9a2011-11-04 00:22:48 +010012521 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
12523 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 if (result == -2)
12526 return NULL;
12527
Christian Heimes217cfd12007-12-02 14:31:20 +000012528 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529}
12530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012531PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012532 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012534Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
12536static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012539 /* initialize variables to prevent gcc warning */
12540 PyObject *substring = NULL;
12541 Py_ssize_t start = 0;
12542 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012543 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
Jesus Ceaac451502011-04-20 17:09:23 +020012545 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12546 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548
Christian Heimesea71a522013-06-29 21:17:34 +020012549 if (PyUnicode_READY(self) == -1) {
12550 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012552 }
12553 if (PyUnicode_READY(substring) == -1) {
12554 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557
Victor Stinner7931d9a2011-11-04 00:22:48 +010012558 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
12560 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012562 if (result == -2)
12563 return NULL;
12564
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565 if (result < 0) {
12566 PyErr_SetString(PyExc_ValueError, "substring not found");
12567 return NULL;
12568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569
Christian Heimes217cfd12007-12-02 14:31:20 +000012570 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571}
12572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012573PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012574 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012576Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012577done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578
12579static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012580unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012582 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 Py_UCS4 fillchar = ' ';
12584
Victor Stinnere9a29352011-10-01 02:14:59 +020012585 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012587
Benjamin Petersonbac79492012-01-14 13:34:47 -050012588 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589 return NULL;
12590
Victor Stinnerc4b49542011-12-11 22:44:26 +010012591 if (PyUnicode_GET_LENGTH(self) >= width)
12592 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593
Victor Stinnerc4b49542011-12-11 22:44:26 +010012594 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595}
12596
Alexander Belopolsky40018472011-02-26 01:02:56 +000012597PyObject *
12598PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599{
12600 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012601
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602 s = PyUnicode_FromObject(s);
12603 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 if (sep != NULL) {
12606 sep = PyUnicode_FromObject(sep);
12607 if (sep == NULL) {
12608 Py_DECREF(s);
12609 return NULL;
12610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611 }
12612
Victor Stinner9310abb2011-10-05 00:59:23 +020012613 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
12615 Py_DECREF(s);
12616 Py_XDECREF(sep);
12617 return result;
12618}
12619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012620PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012621 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622\n\
12623Return a list of the words in S, using sep as the\n\
12624delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012625splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012626whitespace string is a separator and empty strings are\n\
12627removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628
12629static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012630unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012632 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012634 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012636 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12637 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638 return NULL;
12639
12640 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012643 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012645 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646}
12647
Thomas Wouters477c8d52006-05-27 19:21:47 +000012648PyObject *
12649PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12650{
12651 PyObject* str_obj;
12652 PyObject* sep_obj;
12653 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012654 int kind1, kind2;
12655 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012657
12658 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012659 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012661 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012662 if (!sep_obj) {
12663 Py_DECREF(str_obj);
12664 return NULL;
12665 }
12666 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12667 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012668 Py_DECREF(str_obj);
12669 return NULL;
12670 }
12671
Victor Stinner14f8f022011-10-05 20:58:25 +020012672 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 len1 = PyUnicode_GET_LENGTH(str_obj);
12675 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012676 if (kind1 < kind2 || len1 < len2) {
12677 _Py_INCREF_UNICODE_EMPTY();
12678 if (!unicode_empty)
12679 out = NULL;
12680 else {
12681 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12682 Py_DECREF(unicode_empty);
12683 }
12684 Py_DECREF(sep_obj);
12685 Py_DECREF(str_obj);
12686 return out;
12687 }
12688 buf1 = PyUnicode_DATA(str_obj);
12689 buf2 = PyUnicode_DATA(sep_obj);
12690 if (kind2 != kind1) {
12691 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12692 if (!buf2)
12693 goto onError;
12694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012696 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012698 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12699 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12700 else
12701 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 break;
12703 case PyUnicode_2BYTE_KIND:
12704 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12705 break;
12706 case PyUnicode_4BYTE_KIND:
12707 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12708 break;
12709 default:
12710 assert(0);
12711 out = 0;
12712 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012713
12714 Py_DECREF(sep_obj);
12715 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012716 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012718
12719 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 onError:
12721 Py_DECREF(sep_obj);
12722 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012723 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 PyMem_Free(buf2);
12725 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012726}
12727
12728
12729PyObject *
12730PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12731{
12732 PyObject* str_obj;
12733 PyObject* sep_obj;
12734 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012735 int kind1, kind2;
12736 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738
12739 str_obj = PyUnicode_FromObject(str_in);
12740 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742 sep_obj = PyUnicode_FromObject(sep_in);
12743 if (!sep_obj) {
12744 Py_DECREF(str_obj);
12745 return NULL;
12746 }
12747
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012748 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 len1 = PyUnicode_GET_LENGTH(str_obj);
12751 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012752 if (kind1 < kind2 || len1 < len2) {
12753 _Py_INCREF_UNICODE_EMPTY();
12754 if (!unicode_empty)
12755 out = NULL;
12756 else {
12757 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12758 Py_DECREF(unicode_empty);
12759 }
12760 Py_DECREF(sep_obj);
12761 Py_DECREF(str_obj);
12762 return out;
12763 }
12764 buf1 = PyUnicode_DATA(str_obj);
12765 buf2 = PyUnicode_DATA(sep_obj);
12766 if (kind2 != kind1) {
12767 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12768 if (!buf2)
12769 goto onError;
12770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012772 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012774 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12775 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12776 else
12777 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 break;
12779 case PyUnicode_2BYTE_KIND:
12780 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12781 break;
12782 case PyUnicode_4BYTE_KIND:
12783 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12784 break;
12785 default:
12786 assert(0);
12787 out = 0;
12788 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789
12790 Py_DECREF(sep_obj);
12791 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012792 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
12795 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 onError:
12797 Py_DECREF(sep_obj);
12798 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012799 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 PyMem_Free(buf2);
12801 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802}
12803
12804PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012807Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012809found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810
12811static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012812unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813{
Victor Stinner9310abb2011-10-05 00:59:23 +020012814 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815}
12816
12817PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012818 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012820Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012822separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823
12824static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012825unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826{
Victor Stinner9310abb2011-10-05 00:59:23 +020012827 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828}
12829
Alexander Belopolsky40018472011-02-26 01:02:56 +000012830PyObject *
12831PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012832{
12833 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012834
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835 s = PyUnicode_FromObject(s);
12836 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 if (sep != NULL) {
12839 sep = PyUnicode_FromObject(sep);
12840 if (sep == NULL) {
12841 Py_DECREF(s);
12842 return NULL;
12843 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844 }
12845
Victor Stinner9310abb2011-10-05 00:59:23 +020012846 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012847
12848 Py_DECREF(s);
12849 Py_XDECREF(sep);
12850 return result;
12851}
12852
12853PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012854 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855\n\
12856Return a list of the words in S, using sep as the\n\
12857delimiter string, starting at the end of the string and\n\
12858working to the front. If maxsplit is given, at most maxsplit\n\
12859splits are done. If sep is not specified, any whitespace string\n\
12860is a separator.");
12861
12862static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012863unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012864{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012865 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012867 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012868
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012869 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12870 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012871 return NULL;
12872
12873 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012875 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012876 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012877 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012878 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012879}
12880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012881PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883\n\
12884Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012885Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012886is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
12888static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012889unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012891 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012892 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012894 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12895 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896 return NULL;
12897
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012898 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899}
12900
12901static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012902PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012904 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905}
12906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012907PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909\n\
12910Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012911and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912
12913static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012914unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012916 if (PyUnicode_READY(self) == -1)
12917 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012918 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919}
12920
Larry Hastings61272b72014-01-07 12:41:53 -080012921/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012922
Larry Hastings31826802013-10-19 00:09:25 -070012923@staticmethod
12924str.maketrans as unicode_maketrans
12925
12926 x: object
12927
12928 y: unicode=NULL
12929
12930 z: unicode=NULL
12931
12932 /
12933
12934Return a translation table usable for str.translate().
12935
12936If there is only one argument, it must be a dictionary mapping Unicode
12937ordinals (integers) or characters to Unicode ordinals, strings or None.
12938Character keys will be then converted to ordinals.
12939If there are two arguments, they must be strings of equal length, and
12940in the resulting dictionary, each character in x will be mapped to the
12941character at the same position in y. If there is a third argument, it
12942must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012943[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012944
Larry Hastings31826802013-10-19 00:09:25 -070012945static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012946unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012947/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012948{
Georg Brandlceee0772007-11-27 23:48:05 +000012949 PyObject *new = NULL, *key, *value;
12950 Py_ssize_t i = 0;
12951 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012952
Georg Brandlceee0772007-11-27 23:48:05 +000012953 new = PyDict_New();
12954 if (!new)
12955 return NULL;
12956 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 int x_kind, y_kind, z_kind;
12958 void *x_data, *y_data, *z_data;
12959
Georg Brandlceee0772007-11-27 23:48:05 +000012960 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012961 if (!PyUnicode_Check(x)) {
12962 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12963 "be a string if there is a second argument");
12964 goto err;
12965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012967 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12968 "arguments must have equal length");
12969 goto err;
12970 }
12971 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 x_kind = PyUnicode_KIND(x);
12973 y_kind = PyUnicode_KIND(y);
12974 x_data = PyUnicode_DATA(x);
12975 y_data = PyUnicode_DATA(y);
12976 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12977 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012978 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012979 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012980 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012981 if (!value) {
12982 Py_DECREF(key);
12983 goto err;
12984 }
Georg Brandlceee0772007-11-27 23:48:05 +000012985 res = PyDict_SetItem(new, key, value);
12986 Py_DECREF(key);
12987 Py_DECREF(value);
12988 if (res < 0)
12989 goto err;
12990 }
12991 /* create entries for deleting chars in z */
12992 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 z_kind = PyUnicode_KIND(z);
12994 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012995 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012997 if (!key)
12998 goto err;
12999 res = PyDict_SetItem(new, key, Py_None);
13000 Py_DECREF(key);
13001 if (res < 0)
13002 goto err;
13003 }
13004 }
13005 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 int kind;
13007 void *data;
13008
Georg Brandlceee0772007-11-27 23:48:05 +000013009 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013010 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013011 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13012 "to maketrans it must be a dict");
13013 goto err;
13014 }
13015 /* copy entries into the new dict, converting string keys to int keys */
13016 while (PyDict_Next(x, &i, &key, &value)) {
13017 if (PyUnicode_Check(key)) {
13018 /* convert string keys to integer keys */
13019 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013020 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013021 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13022 "table must be of length 1");
13023 goto err;
13024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 kind = PyUnicode_KIND(key);
13026 data = PyUnicode_DATA(key);
13027 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013028 if (!newkey)
13029 goto err;
13030 res = PyDict_SetItem(new, newkey, value);
13031 Py_DECREF(newkey);
13032 if (res < 0)
13033 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013034 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013035 /* just keep integer keys */
13036 if (PyDict_SetItem(new, key, value) < 0)
13037 goto err;
13038 } else {
13039 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13040 "be strings or integers");
13041 goto err;
13042 }
13043 }
13044 }
13045 return new;
13046 err:
13047 Py_DECREF(new);
13048 return NULL;
13049}
13050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013051PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053\n\
13054Return a copy of the string S, where all characters have been mapped\n\
13055through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013056Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013057Unmapped characters are left untouched. Characters mapped to None\n\
13058are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059
13060static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064}
13065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013066PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070
13071static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013072unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013074 if (PyUnicode_READY(self) == -1)
13075 return NULL;
13076 if (PyUnicode_IS_ASCII(self))
13077 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013078 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079}
13080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013081PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013084Pad a numeric string S with zeros on the left, to fill a field\n\
13085of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
13087static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013088unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013090 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013091 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013092 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 int kind;
13094 void *data;
13095 Py_UCS4 chr;
13096
Martin v. Löwis18e16552006-02-15 17:27:45 +000013097 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 return NULL;
13099
Benjamin Petersonbac79492012-01-14 13:34:47 -050013100 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102
Victor Stinnerc4b49542011-12-11 22:44:26 +010013103 if (PyUnicode_GET_LENGTH(self) >= width)
13104 return unicode_result_unchanged(self);
13105
13106 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
13108 u = pad(self, fill, 0, '0');
13109
Walter Dörwald068325e2002-04-15 13:36:47 +000013110 if (u == NULL)
13111 return NULL;
13112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 kind = PyUnicode_KIND(u);
13114 data = PyUnicode_DATA(u);
13115 chr = PyUnicode_READ(kind, data, fill);
13116
13117 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 PyUnicode_WRITE(kind, data, 0, chr);
13120 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 }
13122
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013123 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013124 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013135PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013138Return True if S starts with the specified prefix, False otherwise.\n\
13139With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013140With optional end, stop comparing S at that position.\n\
13141prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
13143static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013144unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013145 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013147 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013148 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013149 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013150 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013151 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152
Jesus Ceaac451502011-04-20 17:09:23 +020013153 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013155 if (PyTuple_Check(subobj)) {
13156 Py_ssize_t i;
13157 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013158 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013159 if (substring == NULL)
13160 return NULL;
13161 result = tailmatch(self, substring, start, end, -1);
13162 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013163 if (result == -1)
13164 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013165 if (result) {
13166 Py_RETURN_TRUE;
13167 }
13168 }
13169 /* nothing matched */
13170 Py_RETURN_FALSE;
13171 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013172 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013173 if (substring == NULL) {
13174 if (PyErr_ExceptionMatches(PyExc_TypeError))
13175 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13176 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013178 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013179 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013181 if (result == -1)
13182 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013183 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184}
13185
13186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013187PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013190Return True if S ends with the specified suffix, False otherwise.\n\
13191With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013192With optional end, stop comparing S at that position.\n\
13193suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194
13195static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013196unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013200 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013201 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013202 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013203 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204
Jesus Ceaac451502011-04-20 17:09:23 +020013205 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207 if (PyTuple_Check(subobj)) {
13208 Py_ssize_t i;
13209 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013210 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013214 result = tailmatch(self, substring, start, end, +1);
13215 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013216 if (result == -1)
13217 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013218 if (result) {
13219 Py_RETURN_TRUE;
13220 }
13221 }
13222 Py_RETURN_FALSE;
13223 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013224 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013225 if (substring == NULL) {
13226 if (PyErr_ExceptionMatches(PyExc_TypeError))
13227 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13228 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013230 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013231 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013232 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013233 if (result == -1)
13234 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013235 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236}
13237
Victor Stinner202fdca2012-05-07 12:47:02 +020013238Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013239_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013240{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013241 if (!writer->readonly)
13242 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13243 else {
13244 /* Copy-on-write mode: set buffer size to 0 so
13245 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13246 * next write. */
13247 writer->size = 0;
13248 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013249 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13250 writer->data = PyUnicode_DATA(writer->buffer);
13251 writer->kind = PyUnicode_KIND(writer->buffer);
13252}
13253
Victor Stinnerd3f08822012-05-29 12:57:52 +020013254void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013255_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013256{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013257 memset(writer, 0, sizeof(*writer));
13258#ifdef Py_DEBUG
13259 writer->kind = 5; /* invalid kind */
13260#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013261 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013262}
13263
Victor Stinnerd3f08822012-05-29 12:57:52 +020013264int
13265_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13266 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013267{
Victor Stinner6989ba02013-11-18 21:08:39 +010013268#ifdef MS_WINDOWS
13269 /* On Windows, overallocate by 50% is the best factor */
13270# define OVERALLOCATE_FACTOR 2
13271#else
13272 /* On Linux, overallocate by 25% is the best factor */
13273# define OVERALLOCATE_FACTOR 4
13274#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013275 Py_ssize_t newlen;
13276 PyObject *newbuffer;
13277
Victor Stinnerd3f08822012-05-29 12:57:52 +020013278 assert(length > 0);
13279
Victor Stinner202fdca2012-05-07 12:47:02 +020013280 if (length > PY_SSIZE_T_MAX - writer->pos) {
13281 PyErr_NoMemory();
13282 return -1;
13283 }
13284 newlen = writer->pos + length;
13285
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013286 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013287
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013289 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013290 if (writer->overallocate
13291 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13292 /* overallocate to limit the number of realloc() */
13293 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013295 if (newlen < writer->min_length)
13296 newlen = writer->min_length;
13297
Victor Stinnerd3f08822012-05-29 12:57:52 +020013298 writer->buffer = PyUnicode_New(newlen, maxchar);
13299 if (writer->buffer == NULL)
13300 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013302 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013303 if (writer->overallocate
13304 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13305 /* overallocate to limit the number of realloc() */
13306 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013307 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013308 if (newlen < writer->min_length)
13309 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013311 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013312 /* resize + widen */
13313 newbuffer = PyUnicode_New(newlen, maxchar);
13314 if (newbuffer == NULL)
13315 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13317 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013318 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013319 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013320 }
13321 else {
13322 newbuffer = resize_compact(writer->buffer, newlen);
13323 if (newbuffer == NULL)
13324 return -1;
13325 }
13326 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013327 }
13328 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013329 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013330 newbuffer = PyUnicode_New(writer->size, maxchar);
13331 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013332 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13334 writer->buffer, 0, writer->pos);
13335 Py_DECREF(writer->buffer);
13336 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013337 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013338 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013339 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013340
13341#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013342}
13343
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013344Py_LOCAL_INLINE(int)
13345_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013346{
13347 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13348 return -1;
13349 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13350 writer->pos++;
13351 return 0;
13352}
13353
13354int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013355_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13356{
13357 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13358}
13359
13360int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013361_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13362{
13363 Py_UCS4 maxchar;
13364 Py_ssize_t len;
13365
13366 if (PyUnicode_READY(str) == -1)
13367 return -1;
13368 len = PyUnicode_GET_LENGTH(str);
13369 if (len == 0)
13370 return 0;
13371 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13372 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013373 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013374 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013375 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013376 Py_INCREF(str);
13377 writer->buffer = str;
13378 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013379 writer->pos += len;
13380 return 0;
13381 }
13382 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13383 return -1;
13384 }
13385 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13386 str, 0, len);
13387 writer->pos += len;
13388 return 0;
13389}
13390
Victor Stinnere215d962012-10-06 23:03:36 +020013391int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013392_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13393 Py_ssize_t start, Py_ssize_t end)
13394{
13395 Py_UCS4 maxchar;
13396 Py_ssize_t len;
13397
13398 if (PyUnicode_READY(str) == -1)
13399 return -1;
13400
13401 assert(0 <= start);
13402 assert(end <= PyUnicode_GET_LENGTH(str));
13403 assert(start <= end);
13404
13405 if (end == 0)
13406 return 0;
13407
13408 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13409 return _PyUnicodeWriter_WriteStr(writer, str);
13410
13411 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13412 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13413 else
13414 maxchar = writer->maxchar;
13415 len = end - start;
13416
13417 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13418 return -1;
13419
13420 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13421 str, start, len);
13422 writer->pos += len;
13423 return 0;
13424}
13425
13426int
Victor Stinner4a587072013-11-19 12:54:53 +010013427_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13428 const char *ascii, Py_ssize_t len)
13429{
13430 if (len == -1)
13431 len = strlen(ascii);
13432
13433 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13434
13435 if (writer->buffer == NULL && !writer->overallocate) {
13436 PyObject *str;
13437
13438 str = _PyUnicode_FromASCII(ascii, len);
13439 if (str == NULL)
13440 return -1;
13441
13442 writer->readonly = 1;
13443 writer->buffer = str;
13444 _PyUnicodeWriter_Update(writer);
13445 writer->pos += len;
13446 return 0;
13447 }
13448
13449 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13450 return -1;
13451
13452 switch (writer->kind)
13453 {
13454 case PyUnicode_1BYTE_KIND:
13455 {
13456 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13457 Py_UCS1 *data = writer->data;
13458
13459 Py_MEMCPY(data + writer->pos, str, len);
13460 break;
13461 }
13462 case PyUnicode_2BYTE_KIND:
13463 {
13464 _PyUnicode_CONVERT_BYTES(
13465 Py_UCS1, Py_UCS2,
13466 ascii, ascii + len,
13467 (Py_UCS2 *)writer->data + writer->pos);
13468 break;
13469 }
13470 case PyUnicode_4BYTE_KIND:
13471 {
13472 _PyUnicode_CONVERT_BYTES(
13473 Py_UCS1, Py_UCS4,
13474 ascii, ascii + len,
13475 (Py_UCS4 *)writer->data + writer->pos);
13476 break;
13477 }
13478 default:
13479 assert(0);
13480 }
13481
13482 writer->pos += len;
13483 return 0;
13484}
13485
13486int
13487_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13488 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013489{
13490 Py_UCS4 maxchar;
13491
13492 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13493 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13494 return -1;
13495 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13496 writer->pos += len;
13497 return 0;
13498}
13499
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013501_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013502{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013503 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013505 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013506 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013508 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013509 str = writer->buffer;
13510 writer->buffer = NULL;
13511 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13512 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513 }
13514 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13515 PyObject *newbuffer;
13516 newbuffer = resize_compact(writer->buffer, writer->pos);
13517 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013518 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 return NULL;
13520 }
13521 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013522 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013523 str = writer->buffer;
13524 writer->buffer = NULL;
13525 assert(_PyUnicode_CheckConsistency(str, 1));
13526 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013527}
13528
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013530_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013531{
13532 Py_CLEAR(writer->buffer);
13533}
13534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013536
13537PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013539\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013540Return a formatted version of S, using substitutions from args and kwargs.\n\
13541The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013542
Eric Smith27bbca62010-11-04 17:06:58 +000013543PyDoc_STRVAR(format_map__doc__,
13544 "S.format_map(mapping) -> str\n\
13545\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013546Return a formatted version of S, using substitutions from mapping.\n\
13547The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013548
Eric Smith4a7d76d2008-05-30 18:10:19 +000013549static PyObject *
13550unicode__format__(PyObject* self, PyObject* args)
13551{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552 PyObject *format_spec;
13553 _PyUnicodeWriter writer;
13554 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013555
13556 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13557 return NULL;
13558
Victor Stinnerd3f08822012-05-29 12:57:52 +020013559 if (PyUnicode_READY(self) == -1)
13560 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013561 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13563 self, format_spec, 0,
13564 PyUnicode_GET_LENGTH(format_spec));
13565 if (ret == -1) {
13566 _PyUnicodeWriter_Dealloc(&writer);
13567 return NULL;
13568 }
13569 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013570}
13571
Eric Smith8c663262007-08-25 02:26:07 +000013572PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013574\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013575Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013576
13577static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013578unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 Py_ssize_t size;
13581
13582 /* If it's a compact object, account for base structure +
13583 character data. */
13584 if (PyUnicode_IS_COMPACT_ASCII(v))
13585 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13586 else if (PyUnicode_IS_COMPACT(v))
13587 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013588 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589 else {
13590 /* If it is a two-block object, account for base object, and
13591 for character block if present. */
13592 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013593 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013594 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013595 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 }
13597 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013598 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013599 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013600 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013601 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013602 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603
13604 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013605}
13606
13607PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013609
13610static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013611unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013612{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013613 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 if (!copy)
13615 return NULL;
13616 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013617}
13618
Guido van Rossumd57fd912000-03-10 22:53:23 +000013619static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013620 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013621 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013622 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13623 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013624 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13625 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013626 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013627 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13628 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13629 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013630 {"expandtabs", (PyCFunction) unicode_expandtabs,
13631 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013632 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013633 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013634 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13635 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13636 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013637 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013638 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13639 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13640 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013641 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013642 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013643 {"splitlines", (PyCFunction) unicode_splitlines,
13644 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013645 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013646 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13647 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13648 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13649 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13650 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13651 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13652 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13653 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13654 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13655 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13656 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13657 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13658 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13659 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013660 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013661 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013662 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013663 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013664 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013665 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013666 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013667 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013668#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013669 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013670 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013671#endif
13672
Benjamin Peterson14339b62009-01-31 16:36:08 +000013673 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674 {NULL, NULL}
13675};
13676
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013677static PyObject *
13678unicode_mod(PyObject *v, PyObject *w)
13679{
Brian Curtindfc80e32011-08-10 20:28:54 -050013680 if (!PyUnicode_Check(v))
13681 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013683}
13684
13685static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013686 0, /*nb_add*/
13687 0, /*nb_subtract*/
13688 0, /*nb_multiply*/
13689 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013690};
13691
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013693 (lenfunc) unicode_length, /* sq_length */
13694 PyUnicode_Concat, /* sq_concat */
13695 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13696 (ssizeargfunc) unicode_getitem, /* sq_item */
13697 0, /* sq_slice */
13698 0, /* sq_ass_item */
13699 0, /* sq_ass_slice */
13700 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701};
13702
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013703static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013704unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 if (PyUnicode_READY(self) == -1)
13707 return NULL;
13708
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013709 if (PyIndex_Check(item)) {
13710 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013711 if (i == -1 && PyErr_Occurred())
13712 return NULL;
13713 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013714 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013715 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013716 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013717 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013718 PyObject *result;
13719 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013720 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013721 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013723 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013725 return NULL;
13726 }
13727
13728 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013729 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013730 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013731 slicelength == PyUnicode_GET_LENGTH(self)) {
13732 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013733 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013734 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013735 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013736 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013737 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013738 src_kind = PyUnicode_KIND(self);
13739 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013740 if (!PyUnicode_IS_ASCII(self)) {
13741 kind_limit = kind_maxchar_limit(src_kind);
13742 max_char = 0;
13743 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13744 ch = PyUnicode_READ(src_kind, src_data, cur);
13745 if (ch > max_char) {
13746 max_char = ch;
13747 if (max_char >= kind_limit)
13748 break;
13749 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013750 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013751 }
Victor Stinner55c99112011-10-13 01:17:06 +020013752 else
13753 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013754 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013755 if (result == NULL)
13756 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013757 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013758 dest_data = PyUnicode_DATA(result);
13759
13760 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013761 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13762 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013763 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013764 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013765 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013766 } else {
13767 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13768 return NULL;
13769 }
13770}
13771
13772static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013773 (lenfunc)unicode_length, /* mp_length */
13774 (binaryfunc)unicode_subscript, /* mp_subscript */
13775 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013776};
13777
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779/* Helpers for PyUnicode_Format() */
13780
Victor Stinnera47082312012-10-04 02:19:54 +020013781struct unicode_formatter_t {
13782 PyObject *args;
13783 int args_owned;
13784 Py_ssize_t arglen, argidx;
13785 PyObject *dict;
13786
13787 enum PyUnicode_Kind fmtkind;
13788 Py_ssize_t fmtcnt, fmtpos;
13789 void *fmtdata;
13790 PyObject *fmtstr;
13791
13792 _PyUnicodeWriter writer;
13793};
13794
13795struct unicode_format_arg_t {
13796 Py_UCS4 ch;
13797 int flags;
13798 Py_ssize_t width;
13799 int prec;
13800 int sign;
13801};
13802
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013804unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013805{
Victor Stinnera47082312012-10-04 02:19:54 +020013806 Py_ssize_t argidx = ctx->argidx;
13807
13808 if (argidx < ctx->arglen) {
13809 ctx->argidx++;
13810 if (ctx->arglen < 0)
13811 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013812 else
Victor Stinnera47082312012-10-04 02:19:54 +020013813 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814 }
13815 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013816 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817 return NULL;
13818}
13819
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013820/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013821
Victor Stinnera47082312012-10-04 02:19:54 +020013822/* Format a float into the writer if the writer is not NULL, or into *p_output
13823 otherwise.
13824
13825 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013826static int
Victor Stinnera47082312012-10-04 02:19:54 +020013827formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13828 PyObject **p_output,
13829 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013831 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013833 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013834 int prec;
13835 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013836
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837 x = PyFloat_AsDouble(v);
13838 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013840
Victor Stinnera47082312012-10-04 02:19:54 +020013841 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013844
Victor Stinnera47082312012-10-04 02:19:54 +020013845 if (arg->flags & F_ALT)
13846 dtoa_flags = Py_DTSF_ALT;
13847 else
13848 dtoa_flags = 0;
13849 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013850 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013851 return -1;
13852 len = strlen(p);
13853 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013854 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013855 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013857 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013858 }
13859 else
13860 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013861 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013862 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013863}
13864
Victor Stinnerd0880d52012-04-27 23:40:13 +020013865/* formatlong() emulates the format codes d, u, o, x and X, and
13866 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13867 * Python's regular ints.
13868 * Return value: a new PyUnicodeObject*, or NULL if error.
13869 * The output string is of the form
13870 * "-"? ("0x" | "0X")? digit+
13871 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13872 * set in flags. The case of hex digits will be correct,
13873 * There will be at least prec digits, zero-filled on the left if
13874 * necessary to get that many.
13875 * val object to be converted
13876 * flags bitmask of format flags; only F_ALT is looked at
13877 * prec minimum number of digits; 0-fill on left if needed
13878 * type a character in [duoxX]; u acts the same as d
13879 *
13880 * CAUTION: o, x and X conversions on regular ints can never
13881 * produce a '-' sign, but can for Python's unbounded ints.
13882 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013883PyObject *
13884_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013885{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013886 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013887 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013888 Py_ssize_t i;
13889 int sign; /* 1 if '-', else 0 */
13890 int len; /* number of characters */
13891 Py_ssize_t llen;
13892 int numdigits; /* len == numnondigits + numdigits */
13893 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013894
Victor Stinnerd0880d52012-04-27 23:40:13 +020013895 /* Avoid exceeding SSIZE_T_MAX */
13896 if (prec > INT_MAX-3) {
13897 PyErr_SetString(PyExc_OverflowError,
13898 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013899 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013900 }
13901
13902 assert(PyLong_Check(val));
13903
13904 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013905 default:
13906 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013907 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013908 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013909 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013910 /* int and int subclasses should print numerically when a numeric */
13911 /* format code is used (see issue18780) */
13912 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013913 break;
13914 case 'o':
13915 numnondigits = 2;
13916 result = PyNumber_ToBase(val, 8);
13917 break;
13918 case 'x':
13919 case 'X':
13920 numnondigits = 2;
13921 result = PyNumber_ToBase(val, 16);
13922 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013923 }
13924 if (!result)
13925 return NULL;
13926
13927 assert(unicode_modifiable(result));
13928 assert(PyUnicode_IS_READY(result));
13929 assert(PyUnicode_IS_ASCII(result));
13930
13931 /* To modify the string in-place, there can only be one reference. */
13932 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013933 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013934 PyErr_BadInternalCall();
13935 return NULL;
13936 }
13937 buf = PyUnicode_DATA(result);
13938 llen = PyUnicode_GET_LENGTH(result);
13939 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013940 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013941 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013942 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013943 return NULL;
13944 }
13945 len = (int)llen;
13946 sign = buf[0] == '-';
13947 numnondigits += sign;
13948 numdigits = len - numnondigits;
13949 assert(numdigits > 0);
13950
13951 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013952 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013953 (type == 'o' || type == 'x' || type == 'X'))) {
13954 assert(buf[sign] == '0');
13955 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13956 buf[sign+1] == 'o');
13957 numnondigits -= 2;
13958 buf += 2;
13959 len -= 2;
13960 if (sign)
13961 buf[0] = '-';
13962 assert(len == numnondigits + numdigits);
13963 assert(numdigits > 0);
13964 }
13965
13966 /* Fill with leading zeroes to meet minimum width. */
13967 if (prec > numdigits) {
13968 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13969 numnondigits + prec);
13970 char *b1;
13971 if (!r1) {
13972 Py_DECREF(result);
13973 return NULL;
13974 }
13975 b1 = PyBytes_AS_STRING(r1);
13976 for (i = 0; i < numnondigits; ++i)
13977 *b1++ = *buf++;
13978 for (i = 0; i < prec - numdigits; i++)
13979 *b1++ = '0';
13980 for (i = 0; i < numdigits; i++)
13981 *b1++ = *buf++;
13982 *b1 = '\0';
13983 Py_DECREF(result);
13984 result = r1;
13985 buf = PyBytes_AS_STRING(result);
13986 len = numnondigits + prec;
13987 }
13988
13989 /* Fix up case for hex conversions. */
13990 if (type == 'X') {
13991 /* Need to convert all lower case letters to upper case.
13992 and need to convert 0x to 0X (and -0x to -0X). */
13993 for (i = 0; i < len; i++)
13994 if (buf[i] >= 'a' && buf[i] <= 'x')
13995 buf[i] -= 'a'-'A';
13996 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013997 if (!PyUnicode_Check(result)
13998 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013999 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014000 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014001 Py_DECREF(result);
14002 result = unicode;
14003 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014004 else if (len != PyUnicode_GET_LENGTH(result)) {
14005 if (PyUnicode_Resize(&result, len) < 0)
14006 Py_CLEAR(result);
14007 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014009}
14010
Ethan Furmandf3ed242014-01-05 06:50:30 -080014011/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014012 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014013 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014014 * -1 and raise an exception on error */
14015static int
Victor Stinnera47082312012-10-04 02:19:54 +020014016mainformatlong(PyObject *v,
14017 struct unicode_format_arg_t *arg,
14018 PyObject **p_output,
14019 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014020{
14021 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014022 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014023
14024 if (!PyNumber_Check(v))
14025 goto wrongtype;
14026
Ethan Furman9ab74802014-03-21 06:38:46 -070014027 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014028 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014029 if (type == 'o' || type == 'x' || type == 'X') {
14030 iobj = PyNumber_Index(v);
14031 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014032 if (PyErr_ExceptionMatches(PyExc_TypeError))
14033 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014034 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014035 }
14036 }
14037 else {
14038 iobj = PyNumber_Long(v);
14039 if (iobj == NULL ) {
14040 if (PyErr_ExceptionMatches(PyExc_TypeError))
14041 goto wrongtype;
14042 return -1;
14043 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014044 }
14045 assert(PyLong_Check(iobj));
14046 }
14047 else {
14048 iobj = v;
14049 Py_INCREF(iobj);
14050 }
14051
14052 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014053 && arg->width == -1 && arg->prec == -1
14054 && !(arg->flags & (F_SIGN | F_BLANK))
14055 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014056 {
14057 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014058 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014059 int base;
14060
Victor Stinnera47082312012-10-04 02:19:54 +020014061 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014062 {
14063 default:
14064 assert(0 && "'type' not in [diuoxX]");
14065 case 'd':
14066 case 'i':
14067 case 'u':
14068 base = 10;
14069 break;
14070 case 'o':
14071 base = 8;
14072 break;
14073 case 'x':
14074 case 'X':
14075 base = 16;
14076 break;
14077 }
14078
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014079 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14080 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014081 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014082 }
14083 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 return 1;
14085 }
14086
Ethan Furmanb95b5612015-01-23 20:05:18 -080014087 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014088 Py_DECREF(iobj);
14089 if (res == NULL)
14090 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014091 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014092 return 0;
14093
14094wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014095 switch(type)
14096 {
14097 case 'o':
14098 case 'x':
14099 case 'X':
14100 PyErr_Format(PyExc_TypeError,
14101 "%%%c format: an integer is required, "
14102 "not %.200s",
14103 type, Py_TYPE(v)->tp_name);
14104 break;
14105 default:
14106 PyErr_Format(PyExc_TypeError,
14107 "%%%c format: a number is required, "
14108 "not %.200s",
14109 type, Py_TYPE(v)->tp_name);
14110 break;
14111 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014112 return -1;
14113}
14114
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014115static Py_UCS4
14116formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014117{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014118 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014119 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014120 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014121 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014122 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014123 goto onError;
14124 }
14125 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014126 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014127 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014128 /* make sure number is a type of integer */
14129 if (!PyLong_Check(v)) {
14130 iobj = PyNumber_Index(v);
14131 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014132 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014133 }
14134 v = iobj;
14135 Py_DECREF(iobj);
14136 }
14137 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014138 x = PyLong_AsLong(v);
14139 if (x == -1 && PyErr_Occurred())
14140 goto onError;
14141
Victor Stinner8faf8212011-12-08 22:14:11 +010014142 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014143 PyErr_SetString(PyExc_OverflowError,
14144 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014145 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014146 }
14147
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014148 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014149 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014150
Benjamin Peterson29060642009-01-31 22:14:21 +000014151 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014152 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014153 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014154 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014155}
14156
Victor Stinnera47082312012-10-04 02:19:54 +020014157/* Parse options of an argument: flags, width, precision.
14158 Handle also "%(name)" syntax.
14159
14160 Return 0 if the argument has been formatted into arg->str.
14161 Return 1 if the argument has been written into ctx->writer,
14162 Raise an exception and return -1 on error. */
14163static int
14164unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14165 struct unicode_format_arg_t *arg)
14166{
14167#define FORMAT_READ(ctx) \
14168 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14169
14170 PyObject *v;
14171
Victor Stinnera47082312012-10-04 02:19:54 +020014172 if (arg->ch == '(') {
14173 /* Get argument value from a dictionary. Example: "%(name)s". */
14174 Py_ssize_t keystart;
14175 Py_ssize_t keylen;
14176 PyObject *key;
14177 int pcount = 1;
14178
14179 if (ctx->dict == NULL) {
14180 PyErr_SetString(PyExc_TypeError,
14181 "format requires a mapping");
14182 return -1;
14183 }
14184 ++ctx->fmtpos;
14185 --ctx->fmtcnt;
14186 keystart = ctx->fmtpos;
14187 /* Skip over balanced parentheses */
14188 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14189 arg->ch = FORMAT_READ(ctx);
14190 if (arg->ch == ')')
14191 --pcount;
14192 else if (arg->ch == '(')
14193 ++pcount;
14194 ctx->fmtpos++;
14195 }
14196 keylen = ctx->fmtpos - keystart - 1;
14197 if (ctx->fmtcnt < 0 || pcount > 0) {
14198 PyErr_SetString(PyExc_ValueError,
14199 "incomplete format key");
14200 return -1;
14201 }
14202 key = PyUnicode_Substring(ctx->fmtstr,
14203 keystart, keystart + keylen);
14204 if (key == NULL)
14205 return -1;
14206 if (ctx->args_owned) {
14207 Py_DECREF(ctx->args);
14208 ctx->args_owned = 0;
14209 }
14210 ctx->args = PyObject_GetItem(ctx->dict, key);
14211 Py_DECREF(key);
14212 if (ctx->args == NULL)
14213 return -1;
14214 ctx->args_owned = 1;
14215 ctx->arglen = -1;
14216 ctx->argidx = -2;
14217 }
14218
14219 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014220 while (--ctx->fmtcnt >= 0) {
14221 arg->ch = FORMAT_READ(ctx);
14222 ctx->fmtpos++;
14223 switch (arg->ch) {
14224 case '-': arg->flags |= F_LJUST; continue;
14225 case '+': arg->flags |= F_SIGN; continue;
14226 case ' ': arg->flags |= F_BLANK; continue;
14227 case '#': arg->flags |= F_ALT; continue;
14228 case '0': arg->flags |= F_ZERO; continue;
14229 }
14230 break;
14231 }
14232
14233 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014234 if (arg->ch == '*') {
14235 v = unicode_format_getnextarg(ctx);
14236 if (v == NULL)
14237 return -1;
14238 if (!PyLong_Check(v)) {
14239 PyErr_SetString(PyExc_TypeError,
14240 "* wants int");
14241 return -1;
14242 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014243 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014244 if (arg->width == -1 && PyErr_Occurred())
14245 return -1;
14246 if (arg->width < 0) {
14247 arg->flags |= F_LJUST;
14248 arg->width = -arg->width;
14249 }
14250 if (--ctx->fmtcnt >= 0) {
14251 arg->ch = FORMAT_READ(ctx);
14252 ctx->fmtpos++;
14253 }
14254 }
14255 else if (arg->ch >= '0' && arg->ch <= '9') {
14256 arg->width = arg->ch - '0';
14257 while (--ctx->fmtcnt >= 0) {
14258 arg->ch = FORMAT_READ(ctx);
14259 ctx->fmtpos++;
14260 if (arg->ch < '0' || arg->ch > '9')
14261 break;
14262 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14263 mixing signed and unsigned comparison. Since arg->ch is between
14264 '0' and '9', casting to int is safe. */
14265 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14266 PyErr_SetString(PyExc_ValueError,
14267 "width too big");
14268 return -1;
14269 }
14270 arg->width = arg->width*10 + (arg->ch - '0');
14271 }
14272 }
14273
14274 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014275 if (arg->ch == '.') {
14276 arg->prec = 0;
14277 if (--ctx->fmtcnt >= 0) {
14278 arg->ch = FORMAT_READ(ctx);
14279 ctx->fmtpos++;
14280 }
14281 if (arg->ch == '*') {
14282 v = unicode_format_getnextarg(ctx);
14283 if (v == NULL)
14284 return -1;
14285 if (!PyLong_Check(v)) {
14286 PyErr_SetString(PyExc_TypeError,
14287 "* wants int");
14288 return -1;
14289 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014290 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014291 if (arg->prec == -1 && PyErr_Occurred())
14292 return -1;
14293 if (arg->prec < 0)
14294 arg->prec = 0;
14295 if (--ctx->fmtcnt >= 0) {
14296 arg->ch = FORMAT_READ(ctx);
14297 ctx->fmtpos++;
14298 }
14299 }
14300 else if (arg->ch >= '0' && arg->ch <= '9') {
14301 arg->prec = arg->ch - '0';
14302 while (--ctx->fmtcnt >= 0) {
14303 arg->ch = FORMAT_READ(ctx);
14304 ctx->fmtpos++;
14305 if (arg->ch < '0' || arg->ch > '9')
14306 break;
14307 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14308 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014309 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014310 return -1;
14311 }
14312 arg->prec = arg->prec*10 + (arg->ch - '0');
14313 }
14314 }
14315 }
14316
14317 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14318 if (ctx->fmtcnt >= 0) {
14319 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14320 if (--ctx->fmtcnt >= 0) {
14321 arg->ch = FORMAT_READ(ctx);
14322 ctx->fmtpos++;
14323 }
14324 }
14325 }
14326 if (ctx->fmtcnt < 0) {
14327 PyErr_SetString(PyExc_ValueError,
14328 "incomplete format");
14329 return -1;
14330 }
14331 return 0;
14332
14333#undef FORMAT_READ
14334}
14335
14336/* Format one argument. Supported conversion specifiers:
14337
14338 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014339 - "i", "d", "u": int or float
14340 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014341 - "e", "E", "f", "F", "g", "G": float
14342 - "c": int or str (1 character)
14343
Victor Stinner8dbd4212012-12-04 09:30:24 +010014344 When possible, the output is written directly into the Unicode writer
14345 (ctx->writer). A string is created when padding is required.
14346
Victor Stinnera47082312012-10-04 02:19:54 +020014347 Return 0 if the argument has been formatted into *p_str,
14348 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014349 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014350static int
14351unicode_format_arg_format(struct unicode_formatter_t *ctx,
14352 struct unicode_format_arg_t *arg,
14353 PyObject **p_str)
14354{
14355 PyObject *v;
14356 _PyUnicodeWriter *writer = &ctx->writer;
14357
14358 if (ctx->fmtcnt == 0)
14359 ctx->writer.overallocate = 0;
14360
14361 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014362 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014363 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014364 return 1;
14365 }
14366
14367 v = unicode_format_getnextarg(ctx);
14368 if (v == NULL)
14369 return -1;
14370
Victor Stinnera47082312012-10-04 02:19:54 +020014371
14372 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014373 case 's':
14374 case 'r':
14375 case 'a':
14376 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14377 /* Fast path */
14378 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14379 return -1;
14380 return 1;
14381 }
14382
14383 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14384 *p_str = v;
14385 Py_INCREF(*p_str);
14386 }
14387 else {
14388 if (arg->ch == 's')
14389 *p_str = PyObject_Str(v);
14390 else if (arg->ch == 'r')
14391 *p_str = PyObject_Repr(v);
14392 else
14393 *p_str = PyObject_ASCII(v);
14394 }
14395 break;
14396
14397 case 'i':
14398 case 'd':
14399 case 'u':
14400 case 'o':
14401 case 'x':
14402 case 'X':
14403 {
14404 int ret = mainformatlong(v, arg, p_str, writer);
14405 if (ret != 0)
14406 return ret;
14407 arg->sign = 1;
14408 break;
14409 }
14410
14411 case 'e':
14412 case 'E':
14413 case 'f':
14414 case 'F':
14415 case 'g':
14416 case 'G':
14417 if (arg->width == -1 && arg->prec == -1
14418 && !(arg->flags & (F_SIGN | F_BLANK)))
14419 {
14420 /* Fast path */
14421 if (formatfloat(v, arg, NULL, writer) == -1)
14422 return -1;
14423 return 1;
14424 }
14425
14426 arg->sign = 1;
14427 if (formatfloat(v, arg, p_str, NULL) == -1)
14428 return -1;
14429 break;
14430
14431 case 'c':
14432 {
14433 Py_UCS4 ch = formatchar(v);
14434 if (ch == (Py_UCS4) -1)
14435 return -1;
14436 if (arg->width == -1 && arg->prec == -1) {
14437 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014438 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014439 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014440 return 1;
14441 }
14442 *p_str = PyUnicode_FromOrdinal(ch);
14443 break;
14444 }
14445
14446 default:
14447 PyErr_Format(PyExc_ValueError,
14448 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014449 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014450 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14451 (int)arg->ch,
14452 ctx->fmtpos - 1);
14453 return -1;
14454 }
14455 if (*p_str == NULL)
14456 return -1;
14457 assert (PyUnicode_Check(*p_str));
14458 return 0;
14459}
14460
14461static int
14462unicode_format_arg_output(struct unicode_formatter_t *ctx,
14463 struct unicode_format_arg_t *arg,
14464 PyObject *str)
14465{
14466 Py_ssize_t len;
14467 enum PyUnicode_Kind kind;
14468 void *pbuf;
14469 Py_ssize_t pindex;
14470 Py_UCS4 signchar;
14471 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014472 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014473 Py_ssize_t sublen;
14474 _PyUnicodeWriter *writer = &ctx->writer;
14475 Py_UCS4 fill;
14476
14477 fill = ' ';
14478 if (arg->sign && arg->flags & F_ZERO)
14479 fill = '0';
14480
14481 if (PyUnicode_READY(str) == -1)
14482 return -1;
14483
14484 len = PyUnicode_GET_LENGTH(str);
14485 if ((arg->width == -1 || arg->width <= len)
14486 && (arg->prec == -1 || arg->prec >= len)
14487 && !(arg->flags & (F_SIGN | F_BLANK)))
14488 {
14489 /* Fast path */
14490 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14491 return -1;
14492 return 0;
14493 }
14494
14495 /* Truncate the string for "s", "r" and "a" formats
14496 if the precision is set */
14497 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14498 if (arg->prec >= 0 && len > arg->prec)
14499 len = arg->prec;
14500 }
14501
14502 /* Adjust sign and width */
14503 kind = PyUnicode_KIND(str);
14504 pbuf = PyUnicode_DATA(str);
14505 pindex = 0;
14506 signchar = '\0';
14507 if (arg->sign) {
14508 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14509 if (ch == '-' || ch == '+') {
14510 signchar = ch;
14511 len--;
14512 pindex++;
14513 }
14514 else if (arg->flags & F_SIGN)
14515 signchar = '+';
14516 else if (arg->flags & F_BLANK)
14517 signchar = ' ';
14518 else
14519 arg->sign = 0;
14520 }
14521 if (arg->width < len)
14522 arg->width = len;
14523
14524 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014525 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014526 if (!(arg->flags & F_LJUST)) {
14527 if (arg->sign) {
14528 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014529 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014530 }
14531 else {
14532 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014533 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014534 }
14535 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014536 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14537 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014538 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014539 }
14540
Victor Stinnera47082312012-10-04 02:19:54 +020014541 buflen = arg->width;
14542 if (arg->sign && len == arg->width)
14543 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014544 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014545 return -1;
14546
14547 /* Write the sign if needed */
14548 if (arg->sign) {
14549 if (fill != ' ') {
14550 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14551 writer->pos += 1;
14552 }
14553 if (arg->width > len)
14554 arg->width--;
14555 }
14556
14557 /* Write the numeric prefix for "x", "X" and "o" formats
14558 if the alternate form is used.
14559 For example, write "0x" for the "%#x" format. */
14560 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14561 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14562 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14563 if (fill != ' ') {
14564 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14565 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14566 writer->pos += 2;
14567 pindex += 2;
14568 }
14569 arg->width -= 2;
14570 if (arg->width < 0)
14571 arg->width = 0;
14572 len -= 2;
14573 }
14574
14575 /* Pad left with the fill character if needed */
14576 if (arg->width > len && !(arg->flags & F_LJUST)) {
14577 sublen = arg->width - len;
14578 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14579 writer->pos += sublen;
14580 arg->width = len;
14581 }
14582
14583 /* If padding with spaces: write sign if needed and/or numeric prefix if
14584 the alternate form is used */
14585 if (fill == ' ') {
14586 if (arg->sign) {
14587 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14588 writer->pos += 1;
14589 }
14590 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14591 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14592 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14593 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14594 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14595 writer->pos += 2;
14596 pindex += 2;
14597 }
14598 }
14599
14600 /* Write characters */
14601 if (len) {
14602 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14603 str, pindex, len);
14604 writer->pos += len;
14605 }
14606
14607 /* Pad right with the fill character if needed */
14608 if (arg->width > len) {
14609 sublen = arg->width - len;
14610 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14611 writer->pos += sublen;
14612 }
14613 return 0;
14614}
14615
14616/* Helper of PyUnicode_Format(): format one arg.
14617 Return 0 on success, raise an exception and return -1 on error. */
14618static int
14619unicode_format_arg(struct unicode_formatter_t *ctx)
14620{
14621 struct unicode_format_arg_t arg;
14622 PyObject *str;
14623 int ret;
14624
Victor Stinner8dbd4212012-12-04 09:30:24 +010014625 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14626 arg.flags = 0;
14627 arg.width = -1;
14628 arg.prec = -1;
14629 arg.sign = 0;
14630 str = NULL;
14631
Victor Stinnera47082312012-10-04 02:19:54 +020014632 ret = unicode_format_arg_parse(ctx, &arg);
14633 if (ret == -1)
14634 return -1;
14635
14636 ret = unicode_format_arg_format(ctx, &arg, &str);
14637 if (ret == -1)
14638 return -1;
14639
14640 if (ret != 1) {
14641 ret = unicode_format_arg_output(ctx, &arg, str);
14642 Py_DECREF(str);
14643 if (ret == -1)
14644 return -1;
14645 }
14646
14647 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14648 PyErr_SetString(PyExc_TypeError,
14649 "not all arguments converted during string formatting");
14650 return -1;
14651 }
14652 return 0;
14653}
14654
Alexander Belopolsky40018472011-02-26 01:02:56 +000014655PyObject *
14656PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014657{
Victor Stinnera47082312012-10-04 02:19:54 +020014658 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014659
Guido van Rossumd57fd912000-03-10 22:53:23 +000014660 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014661 PyErr_BadInternalCall();
14662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014663 }
Victor Stinnera47082312012-10-04 02:19:54 +020014664
14665 ctx.fmtstr = PyUnicode_FromObject(format);
14666 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014667 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014668 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14669 Py_DECREF(ctx.fmtstr);
14670 return NULL;
14671 }
14672 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14673 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14674 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14675 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014676
Victor Stinner8f674cc2013-04-17 23:02:17 +020014677 _PyUnicodeWriter_Init(&ctx.writer);
14678 ctx.writer.min_length = ctx.fmtcnt + 100;
14679 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014680
Guido van Rossumd57fd912000-03-10 22:53:23 +000014681 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014682 ctx.arglen = PyTuple_Size(args);
14683 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014684 }
14685 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014686 ctx.arglen = -1;
14687 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014688 }
Victor Stinnera47082312012-10-04 02:19:54 +020014689 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014690 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014691 ctx.dict = args;
14692 else
14693 ctx.dict = NULL;
14694 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014695
Victor Stinnera47082312012-10-04 02:19:54 +020014696 while (--ctx.fmtcnt >= 0) {
14697 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014698 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014699
14700 nonfmtpos = ctx.fmtpos++;
14701 while (ctx.fmtcnt >= 0 &&
14702 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14703 ctx.fmtpos++;
14704 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014705 }
Victor Stinnera47082312012-10-04 02:19:54 +020014706 if (ctx.fmtcnt < 0) {
14707 ctx.fmtpos--;
14708 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014709 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014710
Victor Stinnercfc4c132013-04-03 01:48:39 +020014711 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14712 nonfmtpos, ctx.fmtpos) < 0)
14713 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014714 }
14715 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014716 ctx.fmtpos++;
14717 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014718 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014719 }
14720 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014721
Victor Stinnera47082312012-10-04 02:19:54 +020014722 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014723 PyErr_SetString(PyExc_TypeError,
14724 "not all arguments converted during string formatting");
14725 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014726 }
14727
Victor Stinnera47082312012-10-04 02:19:54 +020014728 if (ctx.args_owned) {
14729 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014730 }
Victor Stinnera47082312012-10-04 02:19:54 +020014731 Py_DECREF(ctx.fmtstr);
14732 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014733
Benjamin Peterson29060642009-01-31 22:14:21 +000014734 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014735 Py_DECREF(ctx.fmtstr);
14736 _PyUnicodeWriter_Dealloc(&ctx.writer);
14737 if (ctx.args_owned) {
14738 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014739 }
14740 return NULL;
14741}
14742
Jeremy Hylton938ace62002-07-17 16:30:39 +000014743static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014744unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14745
Tim Peters6d6c1a32001-08-02 04:15:00 +000014746static PyObject *
14747unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14748{
Benjamin Peterson29060642009-01-31 22:14:21 +000014749 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014750 static char *kwlist[] = {"object", "encoding", "errors", 0};
14751 char *encoding = NULL;
14752 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014753
Benjamin Peterson14339b62009-01-31 16:36:08 +000014754 if (type != &PyUnicode_Type)
14755 return unicode_subtype_new(type, args, kwds);
14756 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014757 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014758 return NULL;
14759 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014760 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014761 if (encoding == NULL && errors == NULL)
14762 return PyObject_Str(x);
14763 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014764 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014765}
14766
Guido van Rossume023fe02001-08-30 03:12:59 +000014767static PyObject *
14768unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14769{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014770 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014771 Py_ssize_t length, char_size;
14772 int share_wstr, share_utf8;
14773 unsigned int kind;
14774 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014775
Benjamin Peterson14339b62009-01-31 16:36:08 +000014776 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014777
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014778 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014779 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014780 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014781 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014782 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014783 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014784 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014785 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014786
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014787 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014788 if (self == NULL) {
14789 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014790 return NULL;
14791 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792 kind = PyUnicode_KIND(unicode);
14793 length = PyUnicode_GET_LENGTH(unicode);
14794
14795 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014796#ifdef Py_DEBUG
14797 _PyUnicode_HASH(self) = -1;
14798#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014799 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014800#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014801 _PyUnicode_STATE(self).interned = 0;
14802 _PyUnicode_STATE(self).kind = kind;
14803 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014804 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014805 _PyUnicode_STATE(self).ready = 1;
14806 _PyUnicode_WSTR(self) = NULL;
14807 _PyUnicode_UTF8_LENGTH(self) = 0;
14808 _PyUnicode_UTF8(self) = NULL;
14809 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014810 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014811
14812 share_utf8 = 0;
14813 share_wstr = 0;
14814 if (kind == PyUnicode_1BYTE_KIND) {
14815 char_size = 1;
14816 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14817 share_utf8 = 1;
14818 }
14819 else if (kind == PyUnicode_2BYTE_KIND) {
14820 char_size = 2;
14821 if (sizeof(wchar_t) == 2)
14822 share_wstr = 1;
14823 }
14824 else {
14825 assert(kind == PyUnicode_4BYTE_KIND);
14826 char_size = 4;
14827 if (sizeof(wchar_t) == 4)
14828 share_wstr = 1;
14829 }
14830
14831 /* Ensure we won't overflow the length. */
14832 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14833 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014834 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014835 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014836 data = PyObject_MALLOC((length + 1) * char_size);
14837 if (data == NULL) {
14838 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014839 goto onError;
14840 }
14841
Victor Stinnerc3c74152011-10-02 20:39:55 +020014842 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014843 if (share_utf8) {
14844 _PyUnicode_UTF8_LENGTH(self) = length;
14845 _PyUnicode_UTF8(self) = data;
14846 }
14847 if (share_wstr) {
14848 _PyUnicode_WSTR_LENGTH(self) = length;
14849 _PyUnicode_WSTR(self) = (wchar_t *)data;
14850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014851
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014852 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014853 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014854 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014855#ifdef Py_DEBUG
14856 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14857#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014858 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014859 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014860
14861onError:
14862 Py_DECREF(unicode);
14863 Py_DECREF(self);
14864 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014865}
14866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014867PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014868"str(object='') -> str\n\
14869str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014870\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014871Create a new string object from the given object. If encoding or\n\
14872errors is specified, then the object must expose a data buffer\n\
14873that will be decoded using the given encoding and error handler.\n\
14874Otherwise, returns the result of object.__str__() (if defined)\n\
14875or repr(object).\n\
14876encoding defaults to sys.getdefaultencoding().\n\
14877errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014878
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014879static PyObject *unicode_iter(PyObject *seq);
14880
Guido van Rossumd57fd912000-03-10 22:53:23 +000014881PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014882 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014883 "str", /* tp_name */
14884 sizeof(PyUnicodeObject), /* tp_size */
14885 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014886 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014887 (destructor)unicode_dealloc, /* tp_dealloc */
14888 0, /* tp_print */
14889 0, /* tp_getattr */
14890 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014891 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014892 unicode_repr, /* tp_repr */
14893 &unicode_as_number, /* tp_as_number */
14894 &unicode_as_sequence, /* tp_as_sequence */
14895 &unicode_as_mapping, /* tp_as_mapping */
14896 (hashfunc) unicode_hash, /* tp_hash*/
14897 0, /* tp_call*/
14898 (reprfunc) unicode_str, /* tp_str */
14899 PyObject_GenericGetAttr, /* tp_getattro */
14900 0, /* tp_setattro */
14901 0, /* tp_as_buffer */
14902 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014903 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014904 unicode_doc, /* tp_doc */
14905 0, /* tp_traverse */
14906 0, /* tp_clear */
14907 PyUnicode_RichCompare, /* tp_richcompare */
14908 0, /* tp_weaklistoffset */
14909 unicode_iter, /* tp_iter */
14910 0, /* tp_iternext */
14911 unicode_methods, /* tp_methods */
14912 0, /* tp_members */
14913 0, /* tp_getset */
14914 &PyBaseObject_Type, /* tp_base */
14915 0, /* tp_dict */
14916 0, /* tp_descr_get */
14917 0, /* tp_descr_set */
14918 0, /* tp_dictoffset */
14919 0, /* tp_init */
14920 0, /* tp_alloc */
14921 unicode_new, /* tp_new */
14922 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014923};
14924
14925/* Initialize the Unicode implementation */
14926
Victor Stinner3a50e702011-10-18 21:21:00 +020014927int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014928{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014929 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014930 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014931 0x000A, /* LINE FEED */
14932 0x000D, /* CARRIAGE RETURN */
14933 0x001C, /* FILE SEPARATOR */
14934 0x001D, /* GROUP SEPARATOR */
14935 0x001E, /* RECORD SEPARATOR */
14936 0x0085, /* NEXT LINE */
14937 0x2028, /* LINE SEPARATOR */
14938 0x2029, /* PARAGRAPH SEPARATOR */
14939 };
14940
Fred Drakee4315f52000-05-09 19:53:39 +000014941 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014942 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014943 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014944 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014945 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014946
Guido van Rossumcacfc072002-05-24 19:01:59 +000014947 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014948 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014949
14950 /* initialize the linebreak bloom filter */
14951 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014952 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014953 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014954
Christian Heimes26532f72013-07-20 14:57:16 +020014955 if (PyType_Ready(&EncodingMapType) < 0)
14956 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014957
Benjamin Petersonc4311282012-10-30 23:21:10 -040014958 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14959 Py_FatalError("Can't initialize field name iterator type");
14960
14961 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14962 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014963
Victor Stinner3a50e702011-10-18 21:21:00 +020014964 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014965}
14966
14967/* Finalize the Unicode implementation */
14968
/* This function frees nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14974
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975void
Thomas Wouters78890102000-07-22 19:25:51 +000014976_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014977{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014978 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014979
Serhiy Storchaka05997252013-01-26 12:14:02 +020014980 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014981
Serhiy Storchaka05997252013-01-26 12:14:02 +020014982 for (i = 0; i < 256; i++)
14983 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014984 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014985 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014986}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014987
Walter Dörwald16807132007-05-25 13:52:07 +000014988void
14989PyUnicode_InternInPlace(PyObject **p)
14990{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014991 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014993#ifdef Py_DEBUG
14994 assert(s != NULL);
14995 assert(_PyUnicode_CHECK(s));
14996#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014997 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014998 return;
14999#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015000 /* If it's a subclass, we don't really know what putting
15001 it in the interned dict might do. */
15002 if (!PyUnicode_CheckExact(s))
15003 return;
15004 if (PyUnicode_CHECK_INTERNED(s))
15005 return;
15006 if (interned == NULL) {
15007 interned = PyDict_New();
15008 if (interned == NULL) {
15009 PyErr_Clear(); /* Don't leave an exception */
15010 return;
15011 }
15012 }
15013 /* It might be that the GetItem call fails even
15014 though the key is present in the dictionary,
15015 namely when this happens during a stack overflow. */
15016 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015017 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015018 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015019
Victor Stinnerf0335102013-04-14 19:13:03 +020015020 if (t) {
15021 Py_INCREF(t);
15022 Py_DECREF(*p);
15023 *p = t;
15024 return;
15025 }
Walter Dörwald16807132007-05-25 13:52:07 +000015026
Benjamin Peterson14339b62009-01-31 16:36:08 +000015027 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015028 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 PyErr_Clear();
15030 PyThreadState_GET()->recursion_critical = 0;
15031 return;
15032 }
15033 PyThreadState_GET()->recursion_critical = 0;
15034 /* The two references in interned are not counted by refcnt.
15035 The deallocator will take care of this */
15036 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015037 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015038}
15039
15040void
15041PyUnicode_InternImmortal(PyObject **p)
15042{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015043 PyUnicode_InternInPlace(p);
15044 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015045 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 Py_INCREF(*p);
15047 }
Walter Dörwald16807132007-05-25 13:52:07 +000015048}
15049
15050PyObject *
15051PyUnicode_InternFromString(const char *cp)
15052{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015053 PyObject *s = PyUnicode_FromString(cp);
15054 if (s == NULL)
15055 return NULL;
15056 PyUnicode_InternInPlace(&s);
15057 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015058}
15059
Alexander Belopolsky40018472011-02-26 01:02:56 +000015060void
15061_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015062{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015064 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 Py_ssize_t i, n;
15066 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015067
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 if (interned == NULL || !PyDict_Check(interned))
15069 return;
15070 keys = PyDict_Keys(interned);
15071 if (keys == NULL || !PyList_Check(keys)) {
15072 PyErr_Clear();
15073 return;
15074 }
Walter Dörwald16807132007-05-25 13:52:07 +000015075
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15077 detector, interned unicode strings are not forcibly deallocated;
15078 rather, we give them their stolen references back, and then clear
15079 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015080
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 n = PyList_GET_SIZE(keys);
15082 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015083 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015084 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015085 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015086 if (PyUnicode_READY(s) == -1) {
15087 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015088 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015090 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 case SSTATE_NOT_INTERNED:
15092 /* XXX Shouldn't happen */
15093 break;
15094 case SSTATE_INTERNED_IMMORTAL:
15095 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015096 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015097 break;
15098 case SSTATE_INTERNED_MORTAL:
15099 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015100 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 break;
15102 default:
15103 Py_FatalError("Inconsistent interned string state.");
15104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015105 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 }
15107 fprintf(stderr, "total size of all interned strings: "
15108 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15109 "mortal/immortal\n", mortal_size, immortal_size);
15110 Py_DECREF(keys);
15111 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015112 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015113}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015114
15115
15116/********************* Unicode Iterator **************************/
15117
15118typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 PyObject_HEAD
15120 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015121 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015122} unicodeiterobject;
15123
15124static void
15125unicodeiter_dealloc(unicodeiterobject *it)
15126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 _PyObject_GC_UNTRACK(it);
15128 Py_XDECREF(it->it_seq);
15129 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015130}
15131
15132static int
15133unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 Py_VISIT(it->it_seq);
15136 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015137}
15138
15139static PyObject *
15140unicodeiter_next(unicodeiterobject *it)
15141{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015142 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015143
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 assert(it != NULL);
15145 seq = it->it_seq;
15146 if (seq == NULL)
15147 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015148 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015150 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15151 int kind = PyUnicode_KIND(seq);
15152 void *data = PyUnicode_DATA(seq);
15153 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15154 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 if (item != NULL)
15156 ++it->it_index;
15157 return item;
15158 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015159
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 Py_DECREF(seq);
15161 it->it_seq = NULL;
15162 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015163}
15164
15165static PyObject *
15166unicodeiter_len(unicodeiterobject *it)
15167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 Py_ssize_t len = 0;
15169 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015170 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015171 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015172}
15173
15174PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15175
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015176static PyObject *
15177unicodeiter_reduce(unicodeiterobject *it)
15178{
15179 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015180 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015181 it->it_seq, it->it_index);
15182 } else {
15183 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15184 if (u == NULL)
15185 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015186 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015187 }
15188}
15189
15190PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15191
15192static PyObject *
15193unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15194{
15195 Py_ssize_t index = PyLong_AsSsize_t(state);
15196 if (index == -1 && PyErr_Occurred())
15197 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015198 if (it->it_seq != NULL) {
15199 if (index < 0)
15200 index = 0;
15201 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15202 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15203 it->it_index = index;
15204 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015205 Py_RETURN_NONE;
15206}
15207
15208PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15209
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015210static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015212 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015213 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15214 reduce_doc},
15215 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15216 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015217 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015218};
15219
15220PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015221 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15222 "str_iterator", /* tp_name */
15223 sizeof(unicodeiterobject), /* tp_basicsize */
15224 0, /* tp_itemsize */
15225 /* methods */
15226 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15227 0, /* tp_print */
15228 0, /* tp_getattr */
15229 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015230 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 0, /* tp_repr */
15232 0, /* tp_as_number */
15233 0, /* tp_as_sequence */
15234 0, /* tp_as_mapping */
15235 0, /* tp_hash */
15236 0, /* tp_call */
15237 0, /* tp_str */
15238 PyObject_GenericGetAttr, /* tp_getattro */
15239 0, /* tp_setattro */
15240 0, /* tp_as_buffer */
15241 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15242 0, /* tp_doc */
15243 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15244 0, /* tp_clear */
15245 0, /* tp_richcompare */
15246 0, /* tp_weaklistoffset */
15247 PyObject_SelfIter, /* tp_iter */
15248 (iternextfunc)unicodeiter_next, /* tp_iternext */
15249 unicodeiter_methods, /* tp_methods */
15250 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015251};
15252
15253static PyObject *
15254unicode_iter(PyObject *seq)
15255{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015257
Benjamin Peterson14339b62009-01-31 16:36:08 +000015258 if (!PyUnicode_Check(seq)) {
15259 PyErr_BadInternalCall();
15260 return NULL;
15261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015262 if (PyUnicode_READY(seq) == -1)
15263 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015264 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15265 if (it == NULL)
15266 return NULL;
15267 it->it_index = 0;
15268 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015269 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 _PyObject_GC_TRACK(it);
15271 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015272}
15273
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015274
15275size_t
15276Py_UNICODE_strlen(const Py_UNICODE *u)
15277{
15278 int res = 0;
15279 while(*u++)
15280 res++;
15281 return res;
15282}
15283
15284Py_UNICODE*
15285Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15286{
15287 Py_UNICODE *u = s1;
15288 while ((*u++ = *s2++));
15289 return s1;
15290}
15291
15292Py_UNICODE*
15293Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15294{
15295 Py_UNICODE *u = s1;
15296 while ((*u++ = *s2++))
15297 if (n-- == 0)
15298 break;
15299 return s1;
15300}
15301
15302Py_UNICODE*
15303Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15304{
15305 Py_UNICODE *u1 = s1;
15306 u1 += Py_UNICODE_strlen(u1);
15307 Py_UNICODE_strcpy(u1, s2);
15308 return s1;
15309}
15310
15311int
15312Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15313{
15314 while (*s1 && *s2 && *s1 == *s2)
15315 s1++, s2++;
15316 if (*s1 && *s2)
15317 return (*s1 < *s2) ? -1 : +1;
15318 if (*s1)
15319 return 1;
15320 if (*s2)
15321 return -1;
15322 return 0;
15323}
15324
15325int
15326Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15327{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015328 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015329 for (; n != 0; n--) {
15330 u1 = *s1;
15331 u2 = *s2;
15332 if (u1 != u2)
15333 return (u1 < u2) ? -1 : +1;
15334 if (u1 == '\0')
15335 return 0;
15336 s1++;
15337 s2++;
15338 }
15339 return 0;
15340}
15341
15342Py_UNICODE*
15343Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15344{
15345 const Py_UNICODE *p;
15346 for (p = s; *p; p++)
15347 if (*p == c)
15348 return (Py_UNICODE*)p;
15349 return NULL;
15350}
15351
15352Py_UNICODE*
15353Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15354{
15355 const Py_UNICODE *p;
15356 p = s + Py_UNICODE_strlen(s);
15357 while (p != s) {
15358 p--;
15359 if (*p == c)
15360 return (Py_UNICODE*)p;
15361 }
15362 return NULL;
15363}
Victor Stinner331ea922010-08-10 16:37:20 +000015364
Victor Stinner71133ff2010-09-01 23:43:53 +000015365Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015366PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015367{
Victor Stinner577db2c2011-10-11 22:12:48 +020015368 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015369 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015371 if (!PyUnicode_Check(unicode)) {
15372 PyErr_BadArgument();
15373 return NULL;
15374 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015375 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015376 if (u == NULL)
15377 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015378 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015379 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015380 PyErr_NoMemory();
15381 return NULL;
15382 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015383 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015384 size *= sizeof(Py_UNICODE);
15385 copy = PyMem_Malloc(size);
15386 if (copy == NULL) {
15387 PyErr_NoMemory();
15388 return NULL;
15389 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015390 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015391 return copy;
15392}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015393
Georg Brandl66c221e2010-10-14 07:04:07 +000015394/* A _string module, to export formatter_parser and formatter_field_name_split
15395 to the string.Formatter class implemented in Python. */
15396
15397static PyMethodDef _string_methods[] = {
15398 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15399 METH_O, PyDoc_STR("split the argument as a field name")},
15400 {"formatter_parser", (PyCFunction) formatter_parser,
15401 METH_O, PyDoc_STR("parse the argument as a format string")},
15402 {NULL, NULL}
15403};
15404
15405static struct PyModuleDef _string_module = {
15406 PyModuleDef_HEAD_INIT,
15407 "_string",
15408 PyDoc_STR("string helper module"),
15409 0,
15410 _string_methods,
15411 NULL,
15412 NULL,
15413 NULL,
15414 NULL
15415};
15416
15417PyMODINIT_FUNC
15418PyInit__string(void)
15419{
15420 return PyModule_Create(&_string_module);
15421}
15422
15423
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015424#ifdef __cplusplus
15425}
15426#endif